In [2]:
import pandas as pd
import os
import re
import numpy as np
import matplotlib.pyplot as plt


In [3]:
# Execute the sibling notebook time_cleaning.ipynb through IPython's %run magic;
# whatever it prints is shown as this cell's output.
# NOTE(review): every name used in the cells below appears to be defined in this
# notebook itself — confirm whether this run is still required.
%run time_cleaning.ipynb

<class 'int'>
Proportion of collections with certain publication year: 0.665
Proportion of collections with uncertain publication year: 0.335


In [None]:
# Location and naming scheme of the catalogue export: the files are named
# <file_prefix><n><file_end> with n running from 1 to 51.
# NOTE(review): this is an absolute, user-specific path — point it at your own
# copy of the data before running.
folder_path = '/afs/inf.ed.ac.uk/user/s21/s2103701/Desktop/nls-catalogue-published-material_dc'
file_prefix = 'BIBLIOGRAPHIC_11573881650004341_'
file_end = '.xml'

# Parse every file first and concatenate once. Calling pd.concat inside the loop
# copies all previously loaded rows again on each iteration, and seeding the
# result with an empty DataFrame relies on concat behaviour that recent pandas
# versions deprecate.
frames = []
for i in range(1, 52):
    curr_file = file_prefix + str(i) + file_end
    file_path = os.path.join(folder_path, curr_file)
    frames.append(pd.read_xml(file_path))

data = pd.concat(frames, ignore_index=True)

data
    


In [None]:
# Keep only the rows that have a value in the `date` column.
data = data.loc[data['date'].notna()]

In [None]:
def extract_four_digit_year(value):
    """Return the year spelled out by the digits of ``value``, or ``None``.

    All digit characters of ``str(value)`` are collected in order, ignoring
    whatever sits between them (so ``"c. 1850."`` and ``"18-50"`` both give
    ``1850``). A year is returned only when exactly four digits were found
    and they do not start with a zero.

    Args:
        value: Any object; it is converted with ``str()`` before scanning.

    Returns:
        The four digits as an ``int`` in the range 1000-9999, otherwise ``None``.
    """
    digits = ''.join(re.findall(r'\d', str(value)))

    # Test the length of the digit string itself. Measuring it after int()
    # conversion let longer zero-padded runs such as "01999" or "00001850"
    # slip through as four-digit years.
    if len(digits) != 4:
        return None

    year = int(digits)

    # Exactly four digits with a leading zero (e.g. "0123") are not a
    # four-digit year; they were rejected before and still are.
    return year if year >= 1000 else None

In [None]:
# Distinct raw values of the `date` column.
unique_years = data['date'].unique()

# Display the first 1000 of them.
unique_years[:1000]


    


In [None]:
import numpy as np

def extract_year(s, max_year=2023):
    """Heuristically extract one publication year from a free-text date string.

    The rules below are tried in order; the first one that produces a year
    not greater than ``max_year`` wins.

    0. A string containing a single digit run longer than four digits is
       resolved immediately: an 8-digit run yields its first four digits
       (presumably a YYYYMMDD stamp — TODO confirm), anything else yields 0.
    1. The whole string is exactly four digits: values below 1000 yield 0.
    2. A run of five or more digits followed later by four digits: the
       four-digit group is used.
    3. Two four-digit groups: the second one is used.
    4. The first four-digit group, optionally followed by ``c`` or ``.``,
       or a four-digit group wrapped in ``[]``.
    5. ``YYYY-YYYY`` ranges (only reached when rules 3 and 4 matched but
       produced values above ``max_year``): for one range, the start year if
       the end is beyond ``max_year``, else the midpoint; for two ranges
       where the first lies entirely beyond ``max_year``, the midpoint of
       the second.

    Args:
        s: Raw date value. Anything that is not a non-empty ``str`` yields 0.
        max_year: Largest year accepted as plausible (default 2023).

    Returns:
        The extracted year as ``int``, or 0 when no rule applies. The result
        never exceeds ``max_year``. Only rule 1 enforces a lower bound.
    """
    if not isinstance(s, str) or not s:
        return 0

    numbers = re.findall(r'\d+', s)

    # Rule 0: a lone over-long number is decided here and never falls through.
    if len(numbers) == 1 and len(numbers[0]) > 4:
        num = numbers[0]
        if len(num) == 8:
            first_4_digits = int(num[:4])
            if first_4_digits <= max_year:
                return first_4_digits
        return 0

    # Rule 1: the string is nothing but a four-digit number.
    match_4_digits = re.search(r'^(\d{4})$', s)
    if match_4_digits:
        year = int(match_4_digits.group(1))
        if year < 1000:
            return 0
        if year <= max_year:
            return year

    # Rule 2: a longer number followed by a four-digit group.
    match_long_then_4_digits = re.search(r'\d{5,}.*?(\d{4})', s)
    if match_long_then_4_digits:
        year = int(match_long_then_4_digits.group(1))
        if year <= max_year:
            return year

    # Rule 3: two four-digit groups. The second is returned whenever it is
    # plausible, whatever the first one is, so plain ranges such as
    # "1990-1995" are resolved here (to 1995) and never reach rule 5.
    match_two_4_digits = re.search(r'(\d{4}).*?(\d{4})', s)
    if match_two_4_digits:
        second_year = int(match_two_4_digits.group(2))
        if second_year <= max_year:
            return second_year

    # Rule 4: four digits with a trailing "c"/"." or enclosed in "[]".
    match_special = re.search(r'(\d{4}[c\.]?|\[\d{4}\])', s)
    if match_special:
        year = int(match_special.group(1).rstrip("c.").strip("[]"))
        if year <= max_year:
            return year

    # Rule 5: explicit year ranges.
    year_ranges = re.findall(r'(\d{4})-(\d{4})', s)
    if len(year_ranges) == 1:
        start_year, end_year = map(int, year_ranges[0])
        if end_year > max_year:
            # Fall back to the start year, but only if it is plausible itself;
            # previously "9999-9999" came back as 9999.
            if start_year <= max_year:
                return start_year
        elif start_year <= max_year:
            return (start_year + end_year) // 2
    elif len(year_ranges) == 2:
        first_start, first_end = map(int, year_ranges[0])
        second_start, second_end = map(int, year_ranges[1])
        if first_start > max_year and first_end > max_year:
            midpoint = (second_start + second_end) // 2
            # Same guard as above: never hand back an implausible year.
            if midpoint <= max_year:
                return midpoint

    return 0

# Derive a numeric `year` column from the raw `date` strings; extract_year
# returns 0 when it cannot find a usable year.
# `assign` builds a new frame instead of writing into `data`, which at this
# point is a boolean-filtered selection of the loaded frame — adding a column
# to such a selection in place triggers pandas' SettingWithCopyWarning.
data = data.assign(year=data['date'].apply(extract_year))


In [None]:
# Keep only the records for which a year was actually extracted.
# extract_year signals failure with 0, never NaN, so testing isna() alone lets
# every row through; exclude the 0 sentinel as well.
has_year = data['year'].notna() & (data['year'] != 0)
year_defined = data.loc[has_year, ['title', 'creator', 'type', 'publisher',
                                   'date', 'language', 'subject', 'description', 'year']]

year_defined

In [None]:
# Language values that are dropped from the analysis, together with missing ones.
# NOTE(review): 'und' and 'zxx' look like the standard "undetermined" / "no
# linguistic content" codes, 'd' and '- N' like malformed entries — confirm.
excluded_codes = ['und', 'zxx', 'd', '- N']

language_column = year_defined['language']
year_language_defined = year_defined[language_column.notna() & ~language_column.isin(excluded_codes)]

year_language_defined

In [None]:
# Display every distinct raw value of the `date` column.
data['date'].unique()

In [None]:
# Line plot: number of distinct languages among the catalogued records per century.
fig, ax = plt.subplots(figsize=(8, 6))

# Inclusive (first_year, last_year) bounds of each bucket; the last bucket stops at 2023.
centuries = [(1001, 1100), (1101, 1200), (1201, 1300), (1301, 1400), (1401, 1500), (1501, 1600),
             (1601, 1700), (1701, 1800), (1801, 1900), (1901, 2000), (2001, 2023)]

# Tick labels, in the same order as `centuries`.
x_axes = ['11th century', '12th century', '13th century', '14th century', '15th century', '16th century',
          '17th century', '18th century', '19th century', '20th century', '21st century']

language_counts = []
for first_year, last_year in centuries:
    in_century = year_language_defined['year'].between(first_year, last_year)
    # dropna=False mirrors len(pd.unique(...)), which would count NaN as a value.
    language_counts.append(year_language_defined.loc[in_century, 'language'].nunique(dropna=False))

# Draw on the Axes created above instead of going through the pyplot state machine.
ax.plot(x_axes, language_counts)

# Rotate the existing tick labels. set_xticklabels() without fixing the tick
# positions first is discouraged by matplotlib and can emit a warning.
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of languages', fontsize=15)

# Extra vertical lift for the value labels of specific counts (hand-tuned for
# this data set: points whose count is 15 or 41 get their label moved up).
label_lift = {15: 2.5, 41: 3.5}

# Write the count next to each data point.
for position, count in enumerate(language_counts):
    ax.annotate(str(count), xy=(position, count + label_lift.get(count, 0)),
                xytext=(5, 5), textcoords='offset points')

print(language_counts)
plt.tight_layout()
plt.show()


Suspicious extracted years noted so far: 212 (the record has no actual year), 976 (should be 1976), 1000 (should be 1776), 1004.

In [None]:
# Bar chart: number of records per language for years 1401-1500.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(1401, 1500)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 10
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Display names for the three-letter language codes used in the catalogue.
# 'sco' is Scots; Scottish Gaelic is 'gla'.
lan_full_name = {'lat': 'latin', 'ger': 'german', 'ita': 'italian', 'eng': 'english',
                 'fre': 'french', 'grc': 'ancient greek', 'spa': 'spanish', 'gre': 'modern greek',
                 'dut': 'dutch', 'gla': 'scottish gaelic', 'sco': 'scots', 'dan': 'danish', 'rus': 'russian',
                 'wel': 'welsh', 'pol': 'polish', 'chi': 'chinese'}

# Replace the codes with full names. A code that is missing from the table
# keeps its raw code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 15th century', fontsize=15)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart: number of records per language for years 1501-1600.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(1501, 1600)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 100
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Replace the codes with full names from `lan_full_name` (defined in the
# 15th-century cell). A code that is missing from the table keeps its raw
# code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 16th century', fontsize=15)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart: number of records per language for years 1601-1700.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(1601, 1700)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 500
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Replace the codes with full names from `lan_full_name` (defined in the
# 15th-century cell). A code that is missing from the table keeps its raw
# code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 17th century', fontsize=15)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart: number of records per language for years 1701-1800.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(1701, 1800)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 500
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Replace the codes with full names from `lan_full_name` (defined in the
# 15th-century cell). A code that is missing from the table keeps its raw
# code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 18th century', fontsize=15)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart: number of records per language for years 1801-1900.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(1801, 1900)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 500
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Replace the codes with full names from `lan_full_name` (defined in the
# 15th-century cell). A code that is missing from the table keeps its raw
# code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 19th century', fontsize=15)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart: number of records per language for years 1901-2000.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(1901, 2000)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 5000
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Replace the codes with full names from `lan_full_name` (defined in the
# 15th-century cell). A code that is missing from the table keeps its raw
# code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 20th century', fontsize=15)

plt.tight_layout()
plt.show()


In [None]:
# Bar chart: number of records per language for years 2001-2023.
fig, ax = plt.subplots(figsize=(8, 6))

subset = year_language_defined[year_language_defined['year'].between(2001, 2023)]
language_counts = subset['language'].value_counts()

# Languages with at least `threshold` records get their own bar; all others
# are pooled into a single "Other" bar.
threshold = 1000
is_common = language_counts >= threshold
common_languages = language_counts[is_common].copy()
rare_languages = language_counts[~is_common].sum()

# Replace the codes with full names from `lan_full_name` (defined in the
# 15th-century cell). A code that is missing from the table keeps its raw
# code as label instead of turning into NaN.
common_languages.index = common_languages.index.map(lambda code: lan_full_name.get(code, code))

# Add the pooled rare languages as the last bar.
common_languages['Other'] = rare_languages

common_languages.plot(kind='bar', ax=ax)

# Write the count above each bar.
for bar in ax.patches:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2, height, str(int(height)), ha='center', va='bottom')

ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('number of records', fontsize=15)
ax.set_title('language distribution in 21st century', fontsize=15)

plt.tight_layout()
plt.show()
