In [1]:
import pandas as pd
import os
import re
import numpy as np
import lxml
import matplotlib.pyplot as plt

import warnings
from pandas.errors import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

folder_path = '/Users/bin/Desktop/nls-catalogue-published-material_dc'

file_prefix = 'BIBLIOGRAPHIC_11573881650004341_'

file_end = '.xml'

data = pd.DataFrame()

for i in range(1,52):
    curr_file = file_prefix + str(i) + file_end
    file_path = os.path.join(folder_path, curr_file)

    df = pd.read_xml(file_path)

    data = pd.concat([data, df], ignore_index = True)

data

Unnamed: 0,title,creator,type,publisher,date,language,subject,description,coverage,relation,rights,identifier,format
0,Travel /,"Leeson, Edward,1947-2011.",text,London J. Murray,1980,eng,"Short stories, English.",,,,,,
1,Resource book of test items in chemistry,"Jenkins, E. W.(Edgar William)",text,London Murray,1981,eng,Chemistry,Bibliography: p15,,,,,
2,Arbitration for contractors,"Stephenson, Douglas A.",text,Northwood Books,1982,,,,,,,,
3,Armorial bearings of the sovereigns of England...,Standing Conference for Local History.,text,London Bedford Square Press [for the] Standing...,1977,eng,Heraldry,Bibliography: p.29-31,,,,,
4,"Sharing caring : caring, equal opportunities a...","Thompson, Catherine.",text,Community Care Project,[1985],,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5091422,"Symfoni Nr. 5, Opus 50 = Symphony no. 5, opus ...","Fjeldsøe, Michael.edt",notated music,"Copenhagen : Edition Wilhelm Hansen,",1998.,zxx,Symphonies,"""Critical commentary,"" ""Filiation and evaluati...",,,,,
5091423,Music for violin : from 'El Amor brujo' = Musi...,"Falla, Manuel de,1876-1946.Amor brujo.Danza ri...",notated music,"London : Chester Music,",c1996.,zxx,Ballets,Dance of terror (Danza del terror) -- Pantomim...,,,,,
5091424,Fox trot ; and Celebration rag : for SATB reco...,"Hilling, Lyndon.",notated music,"London : Theo Wyatt,",c1984.,eng,,Publ.no.,,,,,
5091425,Fieber : Tondichtung : für Tenor und Orcheste...,"Lehár, Franz1870-1948.",notated music,London Glocken Verlag,[1995],ger,,2827,,,,,


# Time Preprocessing

In [2]:
data_year_undefined = data[data.date.isna() == True]

data = data[data.date.isna() == False]

def extract_four_digit_year(value):
    # Check for the presence of 'c', 'ca', or 'circa' at the beginning
    prefix_match = re.match(r'^(c|ca|circa)', str(value), flags=re.IGNORECASE)
    
    # Extract all digits from the value
    digits = re.findall(r'\d', str(value))
    
    # If no digits found or less than 4 digits, return None
    if not digits or len(digits) < 4:
        return None, "unclear uncertainty"
    
    # Join the digits and convert to integer
    year = int(''.join(digits))
    
    # If exactly 4 digits 
    if len(str(year)) == 4 and int(str(year)) > 1000 and int(str(year)) <= 2023:
        if prefix_match:
            return year, "clear uncertainty"
        if '?' in str(value):
            return year, "clear uncertainty"
        if not prefix_match:
            return year, "certainty"
    
    # If more than 4 digits, return the original value with False
    elif len(str(year)) > 4:
        return value, "unclear uncertainty"
    # Otherwise, return None with False
    else:
        return None, "unclear uncertainty"

    
data['year'], data['certainty'] = zip(*data['date'].apply(extract_four_digit_year))

# retain collections that we are certain about their publication year
data_certain = data[(data.certainty == 'certainty')]
data_clear_uncertain = data[(data.certainty == 'clear uncertainty')]


# Language Preprocessing

In [3]:
# uncertain_unique_languages = pd.unique(data_clear_uncertain['language'])

# diff = [language for language in uncertain_unique_languages if language not in unique_languages]
unique_languages = pd.unique(data_certain['language'])

language_codes = {'eng': 'English','und': 'Undefined','rus': 'Russian','ger': 'German','fre': 'French','swe': 'Swedish',
'spa': 'Spanish','iri': 'Irish','lat': 'Latin','wel': 'Welsh','nor': 'Norwegian','dan': 'Danish','ita': 'Italian','ice': 'Icelandic',
'dut': 'Dutch','cat': 'Catalan','zxx': 'No linguistic content','pol': 'Polish','por': 'Portuguese','gla': 'Scottish Gaelic','fri': 'Frisian',
'ara': 'Arabic','chi': 'Chinese','fin': 'Finnish','grc': 'Ancient Greek','arm': 'Armenian','fro': 'Old French','sco': 'Scots','bre': 'Breton',
'urd': 'Urdu','hun': 'Hungarian','tur': 'Turkish','rum': 'Romanian','gre': 'Greek','hrv': 'Croatian','heb': 'Hebrew','ang': 'Old English',
'gle': 'Irish','mul': 'Multiple languages','jpn': 'Japanese','ukr': 'Ukrainian','slo': 'Slovak','nya': 'Chinyanja','cze': 'Czech','som': 'Somali',
'frm': 'Middle French','vie': 'Vietnamese','mao': 'Maori','per': 'Persian','ind': 'Indonesian','srp': 'Serbian','lit': 'Lithuanian','tah': 'Tahitian',
'lap': 'Sami','roh': 'Romansh','rom': 'Romany','pro': 'Old Occitan','cre': 'Cree','kur': 'Kurdish','geo': 'Georgian','epo': 'Esperanto','pan': 'Punjabi',
'yor': 'Yoruba','inc': 'Indic','slv': 'Slovenian','lav': 'Latvian','enm': 'Middle English','bul': 'Bulgarian','gal': 'Gaelic','lan': 'Occitan',
'chu': 'Church Slavic','kin': 'Kinyarwanda','nic': 'Niger-Kordofanian','afr': 'Afrikaans','may': 'Malay','dum': 'Middle Dutch',
'ssa': 'Nilo-Saharan','suk': 'Sukuma','nep': 'Nepali','bur': 'Burmese','mol': 'Moldavian','wen': 'Sorbian','xho': 'Xhosa',
'fao': 'Faroese','syr': 'Syriac','gmh': 'Middle High German','bos': 'Bosnian','tha': 'Thai','hin': 'Hindi','est': 'Estonian','swa': 'Swahili',
'aze': 'Azerbaijani','cor': 'Cornish','kor': 'Korean','tem': 'Timne','ven': 'Venda','afa': 'Afro-Asiatic','tog': 'Tonga',
'kac': 'Kachin','bem': 'Bemba','tam': 'Tamil','nob': 'Norwegian Bokmål','max': 'Manx','baq': 'Basque','snd': 'Sindhi',
'san': 'Sanskrit','amh': 'Amharic','ben': 'Bengali','esp': 'Esperanto','yao': 'Yao','cpp': 'Creoles and pidgins','scc': 'Serbo-Croatian',
'ijo': 'Ijo','sla': 'Slavic','lug': 'Ganda','fat': 'Fanti','alb': 'Albanian','hau': 'Hausa','fry': 'Western Frisian','mac': 'Macedonian',
'tum': 'Tumbuka','esk': 'Eskimo-Aleut','son': 'Songhai','lad': 'Ladino','mis': 'Miscellaneous languages','sho': 'Shona','nub': 'Nubian',
'syc': 'Classical Syriac','gaa': 'Ga','ota': 'Ottoman Turkish','haw': 'Hawaiian','kik': 'Kikuyu','jav': 'Javanese','snh': 'Shona','bel': 'Belarusian',
'kon': 'Kongo','sag': 'Sango','sso': 'Sotho, Southern','guj': 'Gujarati','glv': 'Manx','cpe': 'Creoles and pidgins, English-based','sal': 'Salishan',
'hat': 'Haitian Creole','map': 'Austronesian','glg': 'Galician','lao': 'Lao','pli': 'Pali','ibo': 'Igbo','nno': 'Norwegian Nynorsk',
'cop': 'Coptic','mar': 'Marathi','tvl': 'Tuvaluan','kaa': 'Kara-Kalpak','twi': 'Twi','nyn': 'Norwegian Nynorsk','loz': 'Lozi','kua': 'Kuanyama',
'mas': 'Masai','ewe': 'Ewe','mlt': 'Maltese','pus': 'Pushto','alg': 'Algonquian','kro': 'Kru','sin': 'Sinhala','btk': 'Batak',
'gem': 'Germanic','bas': 'Basa','luo': 'Luo (Kenya and Tanzania)','sit': 'Sino-Tibetan','dak': 'Dakota','tag': 'Tagalog',
'mla': 'Malayalam','roa': 'Romance','mon': 'Mongolian','nde': 'Ndebele, North','yid': 'Yiddish','akk': 'Akkadian',
'cad': 'Caddo','kab': 'Kabyle','lun': 'Lunda','run': 'Rundi','mal': 'Malayalam','tib': 'Tibetan','far': 'Farsi','egy': 'Egyptian (Ancient)',
'myn': 'Mayan','ton': 'Tonga','fiu': 'Finno-Ugrian','den': 'Slave (Athapascan)',
'efi': 'Efik','tsi': 'Tsimshian','bnt': 'Bantu','din': 'Dinka','oji': 'Ojibwa','zul': 'Zulu','uzb': 'Uzbek','nyo': 'Nyoro',
'sgn': 'Sign languages','ach': 'Acholi','jpr': 'Judeo-Persian','sus': 'Susu','tsw': 'Tswana','nym': 'Nyamwezi','kan': 'Kannada',
'fan': 'Fang','nzi': 'Nzima','mlg': 'Malagasy','nah': 'Nahuatl','lub': 'Luba-Katanga','men': 'Mende','kaz': 'Kazakh',
'asm': 'Assamese','gwi': "Gwich'in",'art': 'Artificial languages','oci': 'Occitan (post 1500)','fij': 'Fijian','lin': 'Lingala',
'bam': 'Bambara','tso': 'Tsonga','eth': 'Ethiopian','nau': 'Nauru','tel': 'Telugu','nai': 'North American Indian','kam': 'Kamba',
'paa': 'Papuan','pau': 'Palauan','ath': 'Athapascan','tir': 'Tigrinya','niu': 'Niuean','tut': 'Altaic',
'gag': 'Gallegan','car': 'Carib','non': 'Old Norse','iro': 'Iroquoian','gua': 'Guaraní','sai': 'South American Indian',
'cel': 'Celtic','wak': 'Wakashan','arc': 'Official Aramaic (700-300 BCE)','del': 'Delaware','sux': 'Sumerian','que': 'Quechua',
'int': 'Interlingua','sam': 'Samaritan Aramaic','hai': 'Haida','alt': 'Southern Altai','rar': 'Rarotongan','mic': 'Micmac',
' ge': 'Georgian','sna': 'Shona','cpf':'Creoles and pidgins, French-based','gon': 'Gondi','ber': 'Berber','elx': 'Elamite',
'ng|': 'Unknown','crp': 'Creoles and pidgins','dyu': 'Dyula','kru': 'Kurukh','man': 'Mandingo','kas': 'Kashmiri','nso': 'Pedi','chr': 'Cherokee','ori': 'Oriya',
'gil': 'Gilbertese','pal': 'Pahlavi','bua': 'Buriat','umb': 'Umbundu','sga': 'Old Irish','tgl': 'Tagalog',
'osa': 'Osage','gez': 'Geez','tar': 'Tatar','ilo': 'Iloko','ave': 'Avestan','wol': 'Wolof',
'peo': 'Old Persian','ine': 'Indo-European','him': 'Western Pahari','got': 'Gothic','sao': 'Sao','ira': 'Iranian',
'iku': 'Inuktitut','cho': 'Choctaw','lam': 'Lamba','cus': 'Cushitic','tiv': 'Tiv','kau': 'Kanuri',
'goh': 'Old High German','ful': 'Fulah','ada': 'Adangme','chg': 'Chagatai','fur': 'Friulian','pra': 'Prakrit',
'cam': 'Khmer','moh': 'Mohawk','ceb': 'Cebuano','er|': 'Unknown','bla': 'Siksika',
'wal': 'Walamo','mkh': 'Mon-Khmer','dua': 'Duala','ta|': 'Tamil','lol': 'Mongo',
'|en': 'English','kha': 'Khasi','ndo': 'Ndonga','pag': 'Pangasinan',
'ban': 'Balinese','taj': 'Tajik','aym': 'Aymara','bad': 'Banda','shn': 'Shan',
'hmn': 'Hmong','chn': 'Chinook jargon','bik': 'Bikol','at|': 'Austroasiatic',
' ': 'Unknown','bal': 'Baluchi','khi': 'Khoisan','kpe': 'Kpelle',
'her': 'Herero',  '   ': 'Unknown', ' en': 'English','d': 'Unknown', 
'tsn':'Tswana', 'khm':'Khmer', 'mus':'Creek', 'ng ': 'Unknown', 'sem':'Semitic', 'tmh':'Tamashek', 'lah':'Lahnda', 'kir':'Kirghiz'              
}

data_certain['language'] = data_certain['language'].map(language_codes)

data_clear_uncertain['language'] = data_clear_uncertain['language'].map(language_codes) 

europe= ["English", "Russian", "German", "French", "Swedish", "Spanish", "Irish", "Latin", "Welsh", "Norwegian", "Danish", "Italian", 
         "Icelandic", "Dutch", "Catalan", "Polish", "Portuguese", "Scottish Gaelic", "Frisian", "Finnish", "Ancient Greek", "Old French", 
         "Scots", "Breton", "Hungarian", "Romanian", "Greek", "Croatian", "Old English", "Ukrainian", "Slovak", "Czech", "Middle French", 
         "Serbian", "Lithuanian", "Romansh", "Old Occitan", "Georgian",  "Slovenian", "Latvian", "Middle English", "Bulgarian", 
         "Occitan", "Church Slavic", "Middle Dutch", "Moldavian", "Sorbian", "Faroese", "Middle High German", "Bosnian", "Estonian", 
         "Cornish", "Norwegian Bokmål", "Amex", "Sami", "Basque", "Serbo-Croatian", "Albanian", "Western Frisian", "Macedonian", "Ladino", "Belarusian", 
         "Galician", "Norwegian Nynorsk", "Germanic", "Romance", "Yiddish", "Wallow", "Finno-Ugrian", "Gallegan", "Old Norse", "Old Irish", "Gothic",
         "Sao", "Old High German", "Fruition", "Celtic"]


asia = ["Arabic", "Chinese", "Armenian", "Urdu",  "Turkish", "Hebrew", "Japanese", "Vietnamese", "Persian", "Indonesian", "Kurdish", "Punjabi",
        "Indic", "Malay", "Nepali", "Burmese", "Syriac", "Thai", "Hindi", "Azerbaijani", "Korean", "Kachin", "Tamil", "Sindhi", "Sanskrit", 
        "Bengali", "Classical Syriac", "Ottoman Turkish", "Javanese", "Austronesian", "Lao", "Pali", "Marathi", "Kara-Kalpak", 
        "Maltese", "Pushto", "Sinhala", "Batak", "Sino-Tibetan", "Tagalog", "Malayalam", "Mongolian", "Akkadian", "Tibetan", "Farsi",
        "Uzbek", "Judeo-Persian", "Kannada", "Nahuatil", "Kazakh", "Assamese", "Telugu", "Altaic", "Khmer", 
        "Official Aramaic (700-300 BCE)", "Sumerian", "Samaritan Aramaic", "Southern Altai", "Gondi", "Elamite", "Pahlavi", "Kashmiri", "Oriya", 
         "Kirghiz", "Bruita", "Tartar", "Ilo", "Avestan", "Old Persian", "Western Pahari", "Iranian", 
        "Chagatai",  "Prakrit", "Cebuano", "Mon-Khmer", "Semitic", "Khasi", "Pangasinan", "Balinese", 
        "Tajik", "Shan", "Hmong", "Bikol", "Lahnda", "Baluchi", "Austroasiatic"]


africa = ["Chinyanja", "Somali", "Yoruba","Kinyarwanda", "Niger-Kordofanian", "Afrikkans", "Nilo-Saharan", "Suzuka", "Xhosa", "Swahili", "Timne",
          "Venda",  "Bemba", "Amharic", "Yao", "Ijo", "Ganda", "Fanti", "Hausa", "Tumbuka", "Songhai", "Shona", 
          "Nubian", "Ga", "Kikuyu", "Kongo", "Sango", "Sotho, Southern", "Gujarati", "Coptic", "Twi", "Lozi", "Kuanyama", "Ewe", "Kru", "Basa",
          "Luo", "Ndebele, North", "Kabyle", "Lunda", "Rundi", "Egyptian (Ancient)", "Efik", "Bantu", "Dinka", "Zulu", "Nyoro", "Acholi", "Susu",
          "Tswana", "Nyamwezi", "Fang", "Nzima", "Malagasy", "Luba-Katanga", "Mende", "Lingala", "Bambara", "Tsonga", "Ethiopian", "Kamba", 
          "Tigrinya", "Berber", "Dyula", "Kurukh", "Mandingo", "Pedi", "Umbundu", "Geez", "Igbo", "Wolof", "Cushitic", 
          "Tiv", "Kanuri", "Fulah", "Adangme", "Walamo", "Duala", "Mongo", "Ndonga", "Tamashek", "Banda", "Khoisan", "Knell", 'Herero' 
          ] 



north_america = ["Cree", "Eskimo-Aleut", "Hawaiian", "Salishan", "Haitian Creole", "Algonquian", "Dakota", "Caddo", "Mayan", "Slave (Athapascan)",
                 "Tsimshian", "Ojibwa", "Gwich’in", "North American Indian", "Athapaskan", "Iroquoian", "Wakashan", "Delaware", "Haida", "Micmac",
                 "Cherokee", "Osage", "Inuktitut", "Choctaw", "Mohawk", "Siksika", "Chinook Jargon", "Creek"]


south_america = ["Carib", "Guaraní", "South American Indian", "Quechua", "Aymara"] 


oceania = ["Maori", "Tahitian,Tonga", "Tuvaluan", "Fijian", "Nauru", "Papuan", "Palauan",  "Niuean", "Rarotonga", "Gilbertese"] 


unknown = ["Unknown","Undefined", "No linguistic content", "Multiple languages","Romany", "Esperanto", "Afro-Asiatic", "Creoles and pidgins",
           "Miscellaneous Languages", "Creoles and pidgins, English-based", "Sign languages", "Artificial languages", "Interlingua", 
           "Creoles and pidgins, French-based", "Indo-European", "Lamba"]

def assign_continent(language):
    if language in europe:
        return "Europe"
    elif language in asia:
        return "Asia"
    elif language in africa:
        return "Africa"
    elif language in north_america:
        return "North America"
    elif language in oceania:
        return "Oceania"
    elif language in unknown or language is np.NaN:
        return "Unknown"
    elif language in south_america:
        return "South America"

data_certain['continent'] = data_certain['language'].apply(assign_continent)
data_clear_uncertain['continent'] = data_clear_uncertain['language'].apply(assign_continent)

data_clear_uncertain

Unnamed: 0,title,creator,type,publisher,date,language,subject,description,coverage,relation,rights,identifier,format,year,certainty,continent
12,The tinder box /,"Spenceley, Annabel.",text,Loughborough Ladybird,c1984,English,,Ill on lining papers,,,,,,1984,clear uncertainty,Europe
19,Study guide and review manual of human embryology,"Moore, Keith L.",text,Philadelphia London Saunders,c1982,English,Fetus,Previous ed.: 1976,,,,,,1982,clear uncertainty,Europe
33,The state in Burma,"Taylor, R. H.(Robert Henry),1943-",text,London Hurst,c1987,English,,Includes bibliography and index,,,,,,1987,clear uncertainty,Europe
36,Well testing in heterogeneous formations,"Streltsova, Tatiana D.",text,Wiley,c1988,English,,,,,,,,1988,clear uncertainty,Europe
37,Pet birds for home and garden /,"Harper, Don.",text,"London : Salamander,",c1986.,English,,,,,,,,1986,clear uncertainty,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5091420,The snowman suite for cello and piano,"Gout, Alan.",notated music,London Highbridge Music Faber Music,c1996,No linguistic content,,c960723,,,,,,1996,clear uncertainty,Unknown
5091421,Motet for double choir : Ich lasse dich nicht /,"Hersom, Herbert.",notated music,"Wheaton Aston : Hawthorns Music,",c1995.,No linguistic content,,,,,,,,1995,clear uncertainty,Unknown
5091423,Music for violin : from 'El Amor brujo' = Musi...,"Falla, Manuel de,1876-1946.Amor brujo.Danza ri...",notated music,"London : Chester Music,",c1996.,No linguistic content,Ballets,Dance of terror (Danza del terror) -- Pantomim...,,,,,,1996,clear uncertainty,Unknown
5091424,Fox trot ; and Celebration rag : for SATB reco...,"Hilling, Lyndon.",notated music,"London : Theo Wyatt,",c1984.,English,,Publ.no.,,,,,,1984,clear uncertainty,Europe


In [4]:
# fdist = FreqDist(types)
# fdist = dict(fdist.most_common(101))

# sum(fdist.values())

# Type Preprocessing

In [5]:
def type_preprocess(key):
    # Keywords to be removed if found at the end of an element
    # keywords = [".lcgft",".rgenbr", ".gsafd", ".lcsh", ".rvmgf", ".aat",
    #             ".rbprov", ".lcsh.",".gsafd.", ".rbgenr","rbenr", "gsafd", "lcsh", "rvmgf", "rbgenr",
    #            ".fast(ocolc)fst01411641", ".fast(ocolc)fst01411628", ".fast(ocolc)fst01423787"]

    if key is not None:
        key = key.lower()

        if 'périodiques' in key:
            key = 'periodical'
        if 'ressource internet' in key:
            key = 'Internet resource'
    
        # for keyword in keywords:
        #     if key.endswith(keyword):
        #         key = key[:-len(keyword)]
        #         break
    return key

# fdist = {type_preprocess(key): value for key, value in fdist.items()}

# fdist.keys()

In [6]:
categories_1500 = {
    'text': {
        'children':{
        'incunabula':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'wooden board':{}}
                 
    },
    
    'miscellaneous':{
        'children':{ 'indulgence':{},  'broadside': {}, 'almanac': {}, 'calandar':{}, 'binding':{}      
        }
    },

}

categories_1550 = {
    'text': {
        'children':{
        'incunabula':{}, 'legal works':{}, 'book':{},'annotation':{}, 'marginalia':{},'inscription':{}     
    }
    },
    'visual media': {
            'children':{'cartographic':{},'device':{}, 'still image':{}, 'wooden board':{}}
                 
    },
    'audio': {
        'children': { 'notated music': {},  'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'calendar':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {},'chapbook':{}, 'binding':{}
                    
        }
    }
}

categories_1600 = {
    'text': {
        'children':{
        'incunabula':{}, 'legal works':{}, 'book':{}, 'marginalia':{}, 
        'poem':{}, 'inscription':{}, 'apologetics':{}, 'grammar':{}, 'catechism':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{},'device':{}, 'still image':{}, 'wooden board':{}, 'painting':{}}
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{}, 'ballad':{}, 'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'calendar':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 
                     'chapbook':{}, 'early works':{}, 'printed waste':{}, 'binding':{}
                    
        }
    },
    'undefined': {}
}

categories_1650 = {
    'text': {
        'children':{
        'incunabula':{}, 'legal works':{}, 'newspaper':{}, 'book':{}, 'dissertation':{},'poem':{}, 'satire':{}, 'annotation':{},
        'religious works':{}, 'inscription':{}, 'periodical':{}, 'legislative proceeding':{}
            
    }
    },
    'visual media': {
            'children':{'cartographic':{},'device':{}, 'still image':{}, 'wooden board':{}, 'painting':{}, 'decorated paper':{}}
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{}, 'ballad':{}, 'presentation':{}, 'sermon':{}
        }
    },
        'IT':{
            'children':{ 'software':{}}
        },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'address':{}, 'chapbook':{},
                     'binding':{}, 'label':{}, 'bookplate':{}, 'early works':{}, 'marking':{}, 'play':{}                
        }
    },
    'undefined': {}
}

categories_1700 = {
    'text': {
        'children':{
        'incunabula':{}, 'legal works':{}, 'newspaper':{}, 'book':{}, 'dissertation':{},'poem':{}, 'satire':{}, 'annotation':{},
        'periodical':{}, 'regulation':{},'grammar':{},'session laws':{}
            
    }
    },
    'visual media': {
            'children':{'cartographic':{},'still image':{}, 'wooden board':{}, 'painting':{}, 'decorated paper':{}}
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{},'ballad':{}, 'presentation':{}, 'narratives':{}, 'sermon':{}, 'elegies':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{}, 'address':{},
                     'binding':{}, 'label':{}, 'catalog':{},'bookplate':{}, 'early works':{}, 'play':{}, 'comedies':{}               
        }
    },
    'undefined': {}
}

categories_1750 = {
    'text': {
        'children':{
        'incunabula':{}, 'legal works':{}, 'newspaper':{}, 'book':{}, 'dissertation':{},'poem':{}, 'annotation':{},
         'periodical':{}, 'letter':{}, 'prospectus':{}, 'libretto':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'still image':{}, 'wooden board':{}, 'painting':{}, 'three dimensional object':{}
                       
                       }           
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{},'ballad':{}, 'narratives':{}, 'speech':{}, 'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{},
                     'address':{},'binding':{}, 'label':{}, 'early works':{}, 'advertisement':{},

                    'account':{}, 'bookplate':{}, 'comedies':{},'play':{}, 'catalog':{}               
        }
    },
    'undefined': {}
}

categories_1800 = {
    'text': {
        'children':{
         'legal works':{}, 'newspaper':{},'book':{}, 'dissertation':{},'poem':{},'annotation':{},
         'periodical':{}, 'letter':{}, 'prospectus':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'still image':{}, 'wooden board':{}, 'three dimensional object':{}                       
                       }
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{},'ballad':{}, 'speech':{}, 'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{},
                     'address':{},'binding':{}, 'label':{}, 'advertisement':{},

                    'account':{}, 'bookplate':{}        
        }
    },
    'undefined': {}
}

categories_1850 = {
    'text': {
        'children':{
         'legal works':{}, 'book':{}, 'dissertation':{},'poem':{},'annotation':{},
         'periodical':{}, 'letter':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'still image':{}, 'wooden board':{}, 'poster':{},   

                        'playbill':{}
                       }
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{},'ballad':{}, 'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{},
                     'address':{},  'advertisement':{},

                    'account':{},       
        }
    },
    'undefined': {}
}

categories_1900 = {
    'text': {
        'children':{
         'legal works':{}, 'book':{}, 'dissertation':{},'poem':{},'annotation':{},
         'periodical':{}, 'journal':{}, 'fiction':{}, 'history':{},'letter':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'still image':{}, 'wooden board':{}, 'poster':{},   
                        'playbill':{}, 'three dimensional object':{}
                       }
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'verse':{},'ballad':{}, 'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{},
                     'address':{}, 'account':{}, 'theater program':{},
                    
                    'advertisement':{}                          
        }
    },
    'undefined': {}
}

categories_1950 = {
    'text': {
        'children':{
         'book':{}, 'dissertation':{},'poem':{},'annotation':{},
         'periodical':{}, 'journal':{}, 'fiction':{}, 'history':{},'letter':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'still image':{}, 'wooden board':{}, 'poster':{}, 'three dimensional object':{},

                        'playbill':{}
                       }
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'recording':{},'verse':{},'ballad':{}, 'sermon':{}
        }
    },
    'miscellaneous':{
        'children':{ 'indulgence':{}, 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{},
                     'address':{}, 'account':{}, 'theater program':{},'advertisement':{}                          
        }
    },
    'undefined': {}
}

categories_2000 = {
    'text': {
        'children':{
         'book':{}, 'newspaper':{},'poem':{},'annotation':{}, 'stories':{}, 'reader':{},
         'periodical':{}, 'journal':{}, 'fiction':{}, 'history':{}, 'bildungsromane':{}, 'tale':{}
    }
    },
    'visual media': {
            'children':{'cartographic':{}, 'moving image':{},'still image':{}, 'poster':{}, 'three dimensional object':{},
                       }
                 
    },
    'audio': {
        'children': { 'notated music': {}, 'recording':{},'ballad':{}
        }
    },
    'IT': {
        'children': { 'software':{}, 'Internet resource':{}
        }
    },
    'miscellaneous':{
        'children':{ 'broadside': {}, 'proclamation': {}, 'almanac': {}, 'chapbook':{},
                      'theater program':{}                           
        }
    },
    'undefined': {}
}

categories_2022 = {
    'text': {
        'children':{
        'book': {}, 'stories': {}, 'fiction': {}, 'tale': {}, 'novel':{}, 'annotation':{},
             'history': {}, 'bildungsromane': {}, 'reader': {}, 'journal': {}, 'periodical': {} #, 'périodique': {}
    }
    },
    'visual media': {
            'children':{'three dimensional object': {},'cartographic': {},'moving image': {}, 'still image': {}, 'poster': {}}              
    },
    'audio': {
        'children': {
            'recording': {}, 'notated music': {}, 'ballad': {}
        }
    },
    'IT': {
        'children': {
            'software':{}, 'Internet resource':{}
        }
    },
    'miscellaneous':{
        'children':{
            'broadside': {}, 'proclamation': {}, 'almanac': {}, 'theater program': {}, 'chapbook':{}
        }
    },
    'undefined': {}
}


In [7]:
def merge_dictionaries(master, addition):
    for key, value in addition.items():
        # If the key doesn't exist in the master, add it
        if key not in master:
            master[key] = value
        else:
            # If both the master and the addition have 'children', merge them
            if 'children' in value and 'children' in master[key]:
                merge_dictionaries(master[key]['children'], value['children'])
            # If the 'children' attribute doesn't exist in the addition, simply pass
            # This would be the case for 'undefined' or other leaf categories
            elif 'children' not in value:
                pass
            else:
                # If 'children' exists in the master but not in the addition,
                # This implies a structure change, handle accordingly, possibly
                # merging the non-children values or overwriting the key.
                # The specific action depends on your desired data structure outcome.
                pass
    return master

# Initialize the master categories dictionary
master_categories = {}

# List of all categories dictionaries
all_categories = [categories_1500, categories_1550, categories_1600, categories_1650,
                  categories_1700, categories_1750, categories_1800, categories_1850,
                  categories_1900, categories_1950, categories_2000, categories_2022]


# Merge all categories into the master_categories dictionary
for categories in all_categories:
    master_categories = merge_dictionaries(master_categories, categories)

# Now master_categories contains all the merged information
master_categories


{'text': {'children': {'incunabula': {},
   'legal works': {},
   'book': {},
   'annotation': {},
   'marginalia': {},
   'inscription': {},
   'poem': {},
   'apologetics': {},
   'grammar': {},
   'catechism': {},
   'newspaper': {},
   'dissertation': {},
   'satire': {},
   'religious works': {},
   'periodical': {},
   'legislative proceeding': {},
   'regulation': {},
   'session laws': {},
   'letter': {},
   'prospectus': {},
   'libretto': {},
   'journal': {},
   'fiction': {},
   'history': {},
   'stories': {},
   'reader': {},
   'bildungsromane': {},
   'tale': {},
   'novel': {}}},
 'visual media': {'children': {'cartographic': {},
   'wooden board': {},
   'device': {},
   'still image': {},
   'painting': {},
   'decorated paper': {},
   'three dimensional object': {},
   'poster': {},
   'playbill': {},
   'moving image': {}}},
 'miscellaneous': {'children': {'indulgence': {},
   'broadside': {},
   'almanac': {},
   'calandar': {},
   'binding': {},
   'calendar': {

In [8]:
from collections import deque
from nltk import FreqDist


def find_deepest_category(entry, categories):
    if entry is None:
        return 'Undefined'

    entry = entry.lower()  # Convert the entry to lowercase for case-insensitive matching
    queue = deque([({'children': categories}, [])])  # Initialize the queue
    deepest_match = ['Undefined']
    longest_match_length = 0  # Track the length of the longest match

    while queue:
        current_item, path = queue.popleft()
        current_level = current_item.get('children', {})

        for category, details in current_level.items():
            category_lower = category.lower()  # Convert category name to lowercase for case-insensitive matching            
            # Check if the category name is a substring of the entry
            if category_lower in entry:
                match_length = len(category_lower)
                # Update the deepest match if this match is longer than any previous match
                if match_length > longest_match_length:
                    deepest_match = path + [category]
                    longest_match_length = match_length

            # If there are children, add them to the queue to explore further
            if 'children' in details:
                queue.append((details, path + [category]))
    
    return deepest_match[0]


def create_dataframe(dataframe, categories, year):
    
    dataframe = dataframe[dataframe.year <= year]
    
    types = dataframe['type'].tolist()
    
    if len(types) < 101:
        fdist = FreqDist(types)
        fdist = dict(fdist.most_common(len(types)))
    else:
        fdist = FreqDist(types)
        fdist = fdist.most_common(101)[1:]  # Skipping the first entry

    most_common_types = [type_preprocess(type_name) for type_name, count in fdist]
    
    # Filter the dataframe for rows where the type is in the most common types
    filtered_dataframe = dataframe[dataframe['type'].apply(type_preprocess).isin(most_common_types)]
    filtered_dataframe['category'] = filtered_dataframe['type'].apply(lambda x: find_deepest_category(x, categories))
    
    # Adding new columns for continent categorization
    def categorize_continent(continent):
        if continent == 'Europe':
            return 'Europe'
#         elif continent == 'Unknown':
#             return 'Unknown'
        else:
            return 'Other'
    
    def continent_subcategory(continent):
        if continent == 'Europe':
            return ''  # Or any other placeholder you see fit
        else:
            return continent

    filtered_dataframe['continent_0'] = filtered_dataframe['continent'].apply(categorize_continent)
    filtered_dataframe['continent_1'] = filtered_dataframe['continent'].apply(continent_subcategory)

    # Group by the new hierarchy levels and count occurrences
    result_dataframe = filtered_dataframe.groupby(['continent_0', 'continent_1', 'category']).size().reset_index(name='counts')
    return result_dataframe

def adjust_dataframe_for_plotting(df):
    # Modify continent_1 where it should not create a segment in the sunburst
    mask = df['continent_0'].isin(['Europe', 'Unknown'])
    df.loc[mask, 'continent_1'] = None  # Setting to None so that Plotly skips this in the visualization
    return df


In [9]:
# Return the total number of collections for each continent of every 50 years
def count_collections(d):
    small_data = d[['type','language','continent','year','certainty']]


    collections = []

    for gap in range(1450, 2050, 50):
        europe_collection = small_data[(small_data.continent == 'Europe') & (small_data.year > gap) & (small_data.year <= gap + 50)]
        asia_collection = small_data[(small_data.continent == 'Asia') & (small_data.year > gap) & (small_data.year <= gap + 50)]
        africa_collection = small_data[(small_data.continent == 'Africa') & (small_data.year > gap) & (small_data.year <= gap + 50)]
        north_america_collection = small_data[(small_data.continent == 'North America') & (small_data.year > gap) & (small_data.year <= gap + 50)]
        oceania_collection = small_data[(small_data.continent == 'Oceania') & (small_data.year > gap) & (small_data.year <= gap + 50)]
        south_america_collection = small_data[(small_data.continent == 'South America') & (small_data.year > gap) & (small_data.year <= gap + 50)]
        unknown_collection = small_data[(small_data.continent == 'Unknown') & (small_data.year > gap) & (small_data.year <= gap + 50)]

        collections.append([len(europe_collection), len(unknown_collection), len(asia_collection), len(africa_collection), len(north_america_collection),
                        len(oceania_collection), len(south_america_collection)])
    collections = np.array(collections)



    collections[0] = np.array(collections[0])

    for i in range(len(collections)-1):
        collections[i+1] = collections[i] + collections[i+1]
    
    return collections

certain_collections = count_collections(data_certain)
clear_uncertain_collections = count_collections(data_clear_uncertain)
total_collections = certain_collections + clear_uncertain_collections


In [10]:
import matplotlib.pyplot as plt
import squarify
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, clear_output
import matplotlib.patches as mpatches
from IPython.display import display, HTML, clear_output

display(HTML("<style>.output {height: auto !important;}</style>"))


time_periods = ["1500", "1550", "1600", "1650", "1700",
                "1750", "1800", "1850", "1900", "1950",
                "2000", "2022"]

continents = ["Unknown", "Asia", "Africa", "North America", "Oceania", "South America"]

# Define colors corresponding to the continents in the same order
color_map = {
    'Unknown': 'mediumorchid',
    'Asia': 'gold',
    'Africa': 'red',
    'North America': 'darkorange',
    'Oceania': 'limegreen',
    'South America': 'sienna'
}

# Define color maps
category_color_map = {
    'text': '#636EFA',          # blue
    'audio': '#EF553B',        # red
    'visual media': '#00CC96', # teal
    'miscellaneous': '#AB63FA',# purple
    'IT': '#FFA15A',           # orange
    'undefined': '#FF6692'     # pink
}

continent_color_map = {
    'Europe': '#B6E880',
    'Unknown': '#FF97FF',
    'Other': '#FECB52',
    'Asia': '#FFA15A',
    'Africa': '#EF553B',
    'North America': '#19D3F3',
    'Oceania': '#FF6692',
    'South America': '#B6E880'
}


# Define colors corresponding to the continents in the same order
color_map = {
    'Unknown': 'mediumorchid',
    'Asia': 'gold',
    'Africa': 'red',
    'North America': 'darkorange',
    'Oceania': 'limegreen',
    'South America': 'sienna'
}

def plot_treemap(index, collections, title_suffix):
    
    # Dynamically set colors based on available categories
    color_map = {
        'Asia': 'gold',
        'Africa': 'red',
        'North America': 'darkorange',
        'South America': 'sienna',
        'Oceania': 'limegreen',
        'Unknown': 'mediumorchid'
        
    }
    
    plt.figure(figsize=(25, 18))
    ax = plt.gca()
    period_data = collections[index]
    sizes = period_data[period_data > 0]  # Filter out zero sizes
    # Only include labels for non-zero sizes
    labels = [str(size) for size in sizes if size > 0]
    # Map colors based on the available sizes
    colors = [color_map[continents[i]] for i, size in enumerate(period_data) if size > 0]
    
    # Determine which continents are present based on the index positions
    present_continent_indices = [i for i, size in enumerate(collections[index]) if size > 0]
    colors = [color_map[continents[i]] for i in present_continent_indices]
    
    squarify.plot(sizes=sizes, color=colors, label=labels, alpha=0.6, ax=ax)
    ax.set_title(f"Treemap for {title_suffix} by {time_periods[index]}", fontsize=20)
    
    for text in ax.texts:
        text.set_fontsize(15)
    
    ax.axis('off')
    
    # Create a legend with the colors used
    legend_labels = {continents[i]: color_map[continents[i]] for i in present_continent_indices}
    legend_handles = [mpatches.Patch(color=color, label=label) for label, color in legend_labels.items()]
    plt.legend(handles=legend_handles, loc='lower center', ncol=3, bbox_to_anchor=(0.5, -0.05), fontsize=18)
    
    plt.show("notebook")
    

def plot_sunburst(data, year):
    fig = px.sunburst(
        data,
        path=['category', 'continent_0', 'continent_1'],
        values='counts',
        title=f"Sunburst Chart of Continent and Category by {year}"
    )
    fig.update_layout(width=1000, height=800, margin=dict(t=50, l=25, r=25, b=25))
    fig.show("notebook")

def unified_update(change):
    selected_index = change['new']
    selected_year = time_periods[selected_index]
    
    # Fetch and prepare data for the selected year
    sunburst_data = create_dataframe(data_certain, master_categories, int(selected_year))
    adjusted_data = adjust_dataframe_for_plotting(sunburst_data)
    
    # Clear previous output
    with output_area:
        clear_output(wait=True)
        
        # Display Sunburst for the selected year
        plot_sunburst(adjusted_data, selected_year)
        
        # Assume certain_collections and clear_uncertain_collections are defined correctly
        plot_treemap(selected_index, certain_collections[:,1:], "Collections of Clear Publication Time")
        plot_treemap(selected_index, clear_uncertain_collections[:,1:], "Collections of Clearly Uncertain Publication Time")
# Setup slider
slider = widgets.SelectionSlider(
    options=[(str(year), i) for i, year in enumerate(time_periods)],
    value=0,
    description='Select Year:',
    continuous_update=False,
    orientation='horizontal',
    readout=True
)

output_area = widgets.Output()
slider.observe(unified_update, names='value')

display(slider, output_area)
unified_update({'new': slider.value})  # Manually trigger the update at start to display initial plot

SelectionSlider(continuous_update=False, description='Select Year:', options=(('1500', 0), ('1550', 1), ('1600…

Output()

In [11]:
# import plotly.express as px
# hierarchy = create_dataframe(data_certain[data_certain.year <= 2000], master_categories)
# adjusted_hierarchy = adjust_dataframe_for_plotting(hierarchy)
# fig = px.sunburst(
#         adjusted_hierarchy,
#         path=['category', 'continent_0', 'continent_1'],
#         values='counts',
#         color = 'continent_1',
# #         labels='Key',
#         title=f"Sunburst Chart of Continent and Category by 2000"
# )
# fig.show('notebook')

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
# from IPython.display import HTML
import plotly
import plotly.express as px


time = [1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 1950, 2000, 2022]

# Define color maps
category_color_map = {
    'text': '#636EFA',          # blue
    'audio': '#EF553B',        # red
    'visual media': '#00CC96', # teal
    'miscellaneous': '#AB63FA',# purple
    'IT': '#FFA15A',           # orange
    'undefined': '#FF6692'     # pink
}

continent_color_map = {
    'Europe': '#B6E880',
    'Unknown': '#FF97FF',
    'Other': '#FECB52',
    'Asia': '#FFA15A',
    'Africa': '#EF553B',
    'North America': '#19D3F3',
    'Oceania': '#FF6692',
    'South America': '#B6E880'
}

# Update graph function to handle slider changes
def update_graph(change):
    selected_year = change['new']  # Correctly access the new value
    hierarchy = create_dataframe(data_certain[data_certain.year <= selected_year], master_categories,selected_year)
    adjusted_hierarchy = adjust_dataframe_for_plotting(hierarchy)
    
    fig = px.sunburst(
        adjusted_hierarchy,
        path=['category', 'continent_0', 'continent_1'],
        values='counts',
        color='category',
        color_discrete_map={**category_color_map, **continent_color_map},  # Merge the two color map
        title=f"Sunburst Chart of Continent and Category by {selected_year}"
    )
    
    w=1000
    h=800

    # Check if the 'continent_1' column is effectively used
    if adjusted_hierarchy['continent_1'].nunique() > 1:
        margin_l = 25  # default margin
        margin_r = 25
    else:
        margin_l = 200  # increase left and right margins to center the plot
        margin_r = 200
        
    # Update traces to show text info
    fig.update_traces(textinfo='label+value')
        
        
    fig.update_layout(
        width=w,
        height=h,
        sunburstcolorway=[v for k, v in category_color_map.items()],  # specific colorway for sunburst
        margin=dict(t=50, l=margin_l, r=margin_r, b=25)
    )

    with out:
        clear_output(wait=True)
        fig.show("notebook")

# Setup the widget and output area
out = widgets.Output()
slider = widgets.IntSlider(
    value=min(time),
    min=min(time),
    max=max(time),
    step=50,
    description='Year:',
    continuous_update=False
)
display(slider, out)
slider.observe(update_graph, names='value')
update_graph({'new': slider.value})  # Initialize with the current slider value

IntSlider(value=1500, continuous_update=False, description='Year:', max=2022, min=1500, step=50)

Output()

In [13]:
# # HTML Template
# html_template = f"""
# <!DOCTYPE html>
# <html>
# <head>
#     <meta charset="UTF-8">
#     <title>Interactive Sunburst Chart</title>
#     <script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
# </head>
# <body>
#     <input type="range" id="year-slider" min="1500" max="2022" value="1500" step="50">
#     <div id="chart"></div>
#     <script>
#         var initialConfig = {fig_json}; // Embed the figure JSON directly

#         function updateFigure(selectedYear) {{
#             // Deep copy to prevent mutation
#             var updatedConfig = JSON.parse(JSON.stringify(initialConfig));
#             updatedConfig.layout.title.text = "Sunburst Chart for Year " + selectedYear; // Dynamically update the title
#             Plotly.react('chart', updatedConfig.data, updatedConfig.layout);
#         }}

#         // Initial rendering of the plot
#         Plotly.newPlot('chart', initialConfig.data, initialConfig.layout);

#         // Add event listener to the slider
#         document.getElementById('year-slider').addEventListener('input', function() {{
#             updateFigure(this.value);
#         }});
#     </script>
# </body>
# </html>
# """

# # Display the HTML in the notebook for preview
# from IPython.core.display import display, HTML
# display(HTML(html_template))

In [14]:
# Write the HTML string to an HTML file
# with open('interactive_sunburst_chart.html', 'w') as f:
#     f.write(html_template)

# print("HTML file created successfully!")

In [15]:
# hierarchies = create_dataframe(data_certain[data_certain.year <= 1550],categories_1550)

# import plotly.express as px

# # Creating the sunburst chart
# fig = px.sunburst(
#     hierarchies, 
#     path=['Level 0', 'Level 1'],  # The hierarchy of your categories
#     values='Value',  # The values for each leaf node
#     labels='Key',  # The names for each leaf node
#     title="Sunburst Chart of Categories",  # Optional: Adds a title to your chart
#     width = 1000
# )

# # Set the size of the figure here
# fig.update_layout(
#     width=800,   # Adjust the width as needed
#     height=600,  # Adjust the height as needed
#     margin=dict(t=50, l=25, r=25, b=25)  # Adjust the margins if needed
# )

# # Show the figure
# fig.show()

# Publication Type Preprocessing

In [16]:
# import re

# def split_words_in_list(words_list):
#     result = []
#     for text in words_list:
#         # Split the text based on spaces or transitions from lowercase to uppercase
#         words = re.findall(r'[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)', text)
#         result.append(words)
#     return result

# unique_types = unique_types[np.logical_not(np.equal(unique_types, None))]
# cleaned_unique_types = split_words_in_list(unique_types)

In [17]:
# def categorize_data(processed_words_list, book_keywords, academic_keywords, 
#                     audio_keywords, art_keywords, computer_keywords, data_keywords):
#     categorized_data = {}
#     for entry in processed_words_list:
#         is_academic = any(word.lower() in academic_keywords or any(keyword.lower() in word.lower() for keyword in academic_keywords) for word in entry)
#         if is_academic:
#             categorized_data[' '.join(entry)] = 'academic'
#         else:
#             is_recording = any(word.lower() in audio_keywords or any(keyword.lower() in word.lower() for keyword in audio_keywords) for word in entry)
#             if is_recording:
#                 categorized_data[' '.join(entry)] = 'recording'
#             else:
#                 is_art = any(word.lower() in art_keywords or any(keyword.lower() in word.lower() for keyword in art_keywords) for word in entry)
#                 if is_art:
#                     categorized_data[' '.join(entry)] = 'art'
#                 else:
#                     is_computer = any(word.lower() in computer_keywords or any(keyword.lower() in word.lower() for keyword in computer_keywords) for word in entry)
#                     if is_computer:
#                         categorized_data[' '.join(entry)] = 'computer'
#                     else:
#                         is_data = any(word.lower() in book_keywords or any(keyword.lower() in word.lower() for keyword in data_keywords) for word in entry)
#                         if is_data:
#                             categorized_data[' '.join(entry)] = 'data'
#                         else:    
#                             is_book = any(word.lower() in book_keywords or any(keyword.lower() in word.lower() for keyword in book_keywords) for word in entry)
#                             if is_book:
#                                 categorized_data[' '.join(entry)] = 'book'
#                             else:
#                                 categorized_data[' '.join(entry)] = 'unknown'
    
#     return categorized_data

# # Example keywords related to books
# book_keywords = [
#     "book", "story", "stories", "fiction", "biography", "tale", "interpretation",
#     "journal", "broadside", "proclamation", "read", "translation", "almanac", 'Brtoadsides',
#     "periodi", "newspaper", "magazine","novel", "print", "text", "poetry", "poem","poezja", "sonnet","humor", "guide", "eriodical",
#     "memoir", "manual", "encyclopedia", "dictionar","recipe", "binding", "publica", "publish", "newsletter", "writing", "constitution"
# ]

# academic_keywords = [
#     "paper", "proceeding", "dissertation", "academic", "conference", "thesis", "annotation", "presentation", "sample", "proof", "guildeline", "treaties",
#      "exam", "guide", "law", "essay", "manuscript", "studies", "report", "literature", "quotation", "teach", "theory", "anecdote", "archive", "review", "reference", "outline",
#     "literary", "document", "bibliograph","criticism", "lecture", "abstracts", "survey", "work", "exam","problem", "question", "grammar","histor", "scientific"
# ]

# audio_keywords = ["song","sing","music","recording", "recorded","score","sound","audio", "symphon", "sonata","anthem","opera","rhyme","ballad","folklore", "speech",
#                   "narrat","commenta", "radio", "jazz", "concert","film","video", "dialog", "interview"]

# art_keywords = ["art","sculpture","paint","draw","theatre","theater","play","drama", "advertise", "portrait",
#                 "picture", "pictorial", "image","photo", "cartoon", "comic","manga", "poster", "lyric", "color", "game"]


# computer_keywords = ["software","computer", "http", "internet", "disc", "disk","code",
#                      "web", "program", "database","electronic", "online", "Elektronische"]

# data_keywords = ["map", "statisti", "chart", "graph", "atlas", "sheet", "address",
#                  "directory", "directories", "list","catalog","collection","form" ]

# # Apply categorization function to the processed words list
# categorized_data = categorize_data(cleaned_unique_types, book_keywords, academic_keywords,
#                                    audio_keywords, art_keywords, computer_keywords, data_keywords)


# # Get the number of items with the value of "unknown"
# unknown_count = sum(1 for category in categorized_data.values() if category == 'unknown')

# print("Number of items with value 'unknown':", unknown_count)

In [18]:
# unknown_keys = [key for key, value in categorized_data.items() if value == 'unknown']

# unknown_keys