In [1]:
# Import the libraries
import pandas as pd
import numpy as np

In [2]:
# Load the ENB book records (one row per book) from the parquet file
df_books = pd.read_parquet(
    path='enb_books.parquet',
    engine='pyarrow',
)

In [3]:
# Check that the file was read in correctly: info() prints the row count and,
# for every column, its non-null count and dtype
df_books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311950 entries, 0 to 311949
Data columns (total 55 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   id                               311950 non-null  string        
 1   date_entered                     311402 non-null  datetime64[ns]
 2   isbn                             176505 non-null  string        
 3   creator                          190290 non-null  string        
 4   contributor                      189373 non-null  string        
 5   publisher                        300498 non-null  string        
 6   title                            311950 non-null  string        
 7   title_remainder                  167068 non-null  string        
 8   title_part_nr                    24584 non-null   string        
 9   title_part_nr_cleaned            24524 non-null   string        
 10  title_varform                    99930 non-n

In [4]:
# Split the catalogue into the two periods under study (year bounds are inclusive)
publication_year = df_books['publication_date_cleaned']
books_18_39 = df_books[publication_year.between(1918, 1939)].copy()
books_40_91 = df_books[publication_year.between(1940, 1991)].copy()


def _count_books_per_place(period_books):
    """Return one row per (harmonised place, latitude, longitude) with its `book_count`."""
    return (
        period_books
        .groupby(['publication_place_harmonized', 'publication_place_latitude', 'publication_place_longitude'])
        .size()
        .reset_index(name='book_count')
    )


# Number of books per harmonised place of publication, separately for each period
books_18_39_counts = _count_books_per_place(books_18_39)
books_40_91_counts = _count_books_per_place(books_40_91)

# Stack both periods and add the counts up per coordinate pair
total_counts = pd.concat([
    books_18_39_counts.assign(period='1918-1939'),
    books_40_91_counts.assign(period='1940-1991'),
])
total_counts_by_location = (
    total_counts
    .groupby(['publication_place_latitude', 'publication_place_longitude'])
    .agg(total_book_count=('book_count', 'sum'))
    .reset_index()
)

# Log-scale the combined totals; log1p(x) = log(1 + x), so it is defined for x = 0 too
total_counts_by_location['log_book_count'] = np.log1p(total_counts_by_location['total_book_count'])

# Shared normalisation denominator: the largest log-scaled combined total of any location
max_total_count = total_counts_by_location['log_book_count'].max()

# Put both periods on the same scale by dividing their log counts by the shared maximum
for period_counts in (books_18_39_counts, books_40_91_counts):
    period_counts['log_book_count'] = np.log1p(period_counts['book_count'])
    period_counts['normalized_book_count'] = period_counts['log_book_count'] / max_total_count

# Export one CSV per period (without the index) for further visualisation in QGIS
for csv_name, period_counts in (
    ('books_18_39_counts.csv', books_18_39_counts),
    ('books_40_91_counts.csv', books_40_91_counts),
):
    period_counts.to_csv(csv_name, index=False)

In [5]:
# Narrow the 1940-1991 subset down to the books whose harmonised place of publication is
# Stockholm, where most Estonian diaspora literature was published during the Cold War
is_stockholm = books_40_91['publication_place_harmonized'] == 'Stockholm'
publication_place_stockholm = books_40_91[is_stockholm]
publication_place_stockholm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1340 entries, 209 to 311925
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   id                               1340 non-null   string        
 1   date_entered                     1339 non-null   datetime64[ns]
 2   isbn                             342 non-null    string        
 3   creator                          895 non-null    string        
 4   contributor                      642 non-null    string        
 5   publisher                        1311 non-null   string        
 6   title                            1340 non-null   string        
 7   title_remainder                  795 non-null    string        
 8   title_part_nr                    151 non-null    string        
 9   title_part_nr_cleaned            150 non-null    string        
 10  title_varform                    222 non-null    string 

In [6]:
# Count how many of the Stockholm-published books are recorded under each language code,
# most frequent first
publication_place_stockholm['language'].value_counts()

est    776
swe    325
eng    162
ger     62
fre      7
fin      5
spa      1
cze      1
rus      1
Name: language, dtype: Int64

In [7]:
# Investigating certain topic keywords, such as "eesti" ("Estonia") and "väliseesti" ("exile Estonia")
value1 = "eesti [EMS174969]"
value2 = "väliseesti [EMS021662]"
target_keywords = {value1, value2}


def _has_target_keyword(keyword_cell):
    """Return True if any ';'-separated, whitespace-stripped entry of `keyword_cell`
    is exactly one of the target keywords.

    A cell holding a single keyword (no ';') splits into a one-element list, so
    no separate whole-cell equality check is needed.
    """
    entries = (entry.strip() for entry in keyword_cell.split(';'))
    return not target_keywords.isdisjoint(entries)


# Missing keyword cells become '' so they can be split like any other cell (and never match)
topic_keyword_stockholm = publication_place_stockholm[
    publication_place_stockholm['topic_keyword'].fillna('').apply(_has_target_keyword)
]

In [8]:
# Check how many of the Stockholm-published books carry at least one of these topic
# keywords (the entry count printed by info() is the number of matching books)
topic_keyword_stockholm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 520 entries, 1676 to 311925
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   id                               520 non-null    string        
 1   date_entered                     520 non-null    datetime64[ns]
 2   isbn                             129 non-null    string        
 3   creator                          298 non-null    string        
 4   contributor                      320 non-null    string        
 5   publisher                        515 non-null    string        
 6   title                            520 non-null    string        
 7   title_remainder                  366 non-null    string        
 8   title_part_nr                    48 non-null     string        
 9   title_part_nr_cleaned            47 non-null     string        
 10  title_varform                    84 non-null     string 