In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import re
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

## Read the book metadata dataset

In [3]:
df = pd.read_parquet('metadata_en.parquet')
df

Unnamed: 0,title,creator,birthdate,deathdate,publisher,issued,rights,language,subjects,description,Index
1,The Master Spirit,"Magnay, William, Sir",1855,1917,Project Gutenberg,2022-03-25,Public domain in the USA.,en,"[London (England) -- Fiction, Love stories, Ad...",[Reading ease score: 71.3 (7th grade). Fairly ...,67703
2,The Family on Wheels,"Oxley, J. Macdonald (James Macdonald)",1855,1907,Project Gutenberg,2017-11-05,Public domain in the USA.,en,"[Orphans -- Juvenile fiction, Siblings -- Juve...",[Reading ease score: 77.9 (7th grade). Fairly ...,55891
3,Three Good Giants\nWhose Ancient Deeds are rec...,"Rabelais, François",1835,1901,Project Gutenberg,2019-04-09,Public domain in the USA.,en,"[Fantasy fiction, Conduct of life -- Juvenile ...",[Reading ease score: 76.2 (7th grade). Fairly ...,59235
4,Drake's Road Book of the Grand Junction Railwa...,"Drake, James, active 1825",,,Project Gutenberg,2013-07-31,Public domain in the USA.,en,[Grand Junction Railway Company (Great Britain...,[Reading ease score: 65.5 (8th & 9th grade). N...,43367
5,Facts and Speculations on the Origin and Histo...,"Chatto, William Andrew",1799,1864,Project Gutenberg,2014-05-04,Public domain in the USA.,en,"[Playing cards -- History, GV]",[Reading ease score: 62.7 (8th & 9th grade). N...,45584
...,...,...,...,...,...,...,...,...,...,...,...
74677,Reform and Politics\r\nPart 2 from The Works o...,"Whittier, John Greenleaf",1807,1892,Project Gutenberg,2005-12-01,Public domain in the USA.,en,"[United States -- Politics and government, PS]",[Reading ease score: 49.7 (College-level). Dif...,9596
74678,"The Speedwell Boys and Their Ice Racer; Or, Lo...","Rockwood, Roy",,,Project Gutenberg,2015-06-07,Public domain in the USA.,en,"[Iceboating -- Juvenile fiction, PZ]",[Reading ease score: 83.7 (6th grade). Easy to...,49162
74680,The Starling: A Scottish Story,"Macleod, Norman",1812,1872,Project Gutenberg,2013-02-12,Public domain in the USA.,en,"[Clergy -- Fiction, Villages -- Fiction, Scotl...",[Reading ease score: 78.4 (7th grade). Fairly ...,41989
74682,"Chronicles of England, Scotland and Ireland (2...","Holinshed, Raphael",,,Project Gutenberg,2005-09-25,Public domain in the USA.,en,"[Great Britain -- History -- Tudors, 1485-1603...",[Reading ease score: 58.5 (10th to 12th grade)...,16749


Basic information about the dataset

In [4]:
print(f"Dataframe's shape: {df.shape}\n")
print(df.info())

Dataframe's shape: (59852, 11)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59852 entries, 1 to 74683
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        59847 non-null  object
 1   creator      59698 non-null  object
 2   birthdate    49715 non-null  object
 3   deathdate    48613 non-null  object
 4   publisher    59852 non-null  object
 5   issued       59852 non-null  object
 6   rights       59852 non-null  object
 7   language     59852 non-null  object
 8   subjects     59852 non-null  object
 9   description  59852 non-null  object
 10  Index        59852 non-null  object
dtypes: object(11)
memory usage: 5.5+ MB
None


## Data Preprocessing

In [5]:
print("Birthdate's unique values:\n", df['birthdate'].unique())
print("Deathdate's unique values:\n", df['deathdate'].unique())

Birthdate's unique values:
 ['1855' '1835' None '1799' '1836' '1873' '1804' '1803' '1837' '1911'
 '1859' '1812' '1865' '1813' '1851' '1849' '1862' '1879' '1846' '1860'
 '1885' '1872' '1822' '1910' '1814' '1876' '1893' '1903' '1882' '1852'
 '1785' '1838' '1877' '1834' '1824' '1870' '1861' '1802' '1868' '1842'
 '1892' '1858' '1664' '1866' '1880' '1867' '1881' '1854' '1841' '1839'
 '1856' '1848' '1853' '1875' '1863' '1828' '1871' '1772' '1923' '1830'
 '1751' '1889' '1794' '1831' '1899' '1878' '1925' '1922' '1753' '1832'
 '1883' '1633' '1817' '1907' '1811' '1845' '1869' '1747' '1850' '1971'
 '1833' '1717' '1914' '1826' '1840' '1930' '1791' '1810' '1801' '1844'
 '1564' '1928' '1857' '1792' '1901' '1787' '1703' '1821' '1888' '1827'
 '1819' '1890' '1805' '1809' '1843' '1797' '1783' '1820' '1776' '1641'
 '1511' '1864' '1912' '1823' '1886' '1927' '1847' '1789' '1469' '1806'
 '1755' '1497' '1829' '1894' '1760' '1897' '1775' '-431' '1527' '1896'
 '1807' '1796' '1887' '1771' '1728' '1918' '1825' '

In [6]:
df['birthdate'] = df['birthdate'].astype('Int64')
df['deathdate'] = df['deathdate'].astype('Int64')

print(df.info())

display(df[['birthdate', 'deathdate']].head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59852 entries, 1 to 74683
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        59847 non-null  object
 1   creator      59698 non-null  object
 2   birthdate    49715 non-null  Int64 
 3   deathdate    48613 non-null  Int64 
 4   publisher    59852 non-null  object
 5   issued       59852 non-null  object
 6   rights       59852 non-null  object
 7   language     59852 non-null  object
 8   subjects     59852 non-null  object
 9   description  59852 non-null  object
 10  Index        59852 non-null  object
dtypes: Int64(2), object(9)
memory usage: 5.6+ MB
None


Unnamed: 0,birthdate,deathdate
1,1855.0,1917.0
2,1855.0,1907.0
3,1835.0,1901.0
4,,
5,1799.0,1864.0


In [7]:
df['issued'] = pd.to_datetime(df['issued'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59852 entries, 1 to 74683
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   title        59847 non-null  object        
 1   creator      59698 non-null  object        
 2   birthdate    49715 non-null  Int64         
 3   deathdate    48613 non-null  Int64         
 4   publisher    59852 non-null  object        
 5   issued       59852 non-null  datetime64[ns]
 6   rights       59852 non-null  object        
 7   language     59852 non-null  object        
 8   subjects     59852 non-null  object        
 9   description  59852 non-null  object        
 10  Index        59852 non-null  object        
dtypes: Int64(2), datetime64[ns](1), object(8)
memory usage: 5.6+ MB


In [8]:
print("Number of unique values in the publisher column: {0}".format(
    df['publisher'].unique().shape[0]
))

Number of unique values in the publisher column: 1


In [9]:
del df['publisher']

display(df.head())

Unnamed: 0,title,creator,birthdate,deathdate,issued,rights,language,subjects,description,Index
1,The Master Spirit,"Magnay, William, Sir",1855.0,1917.0,2022-03-25,Public domain in the USA.,en,"[London (England) -- Fiction, Love stories, Ad...",[Reading ease score: 71.3 (7th grade). Fairly ...,67703
2,The Family on Wheels,"Oxley, J. Macdonald (James Macdonald)",1855.0,1907.0,2017-11-05,Public domain in the USA.,en,"[Orphans -- Juvenile fiction, Siblings -- Juve...",[Reading ease score: 77.9 (7th grade). Fairly ...,55891
3,Three Good Giants\nWhose Ancient Deeds are rec...,"Rabelais, François",1835.0,1901.0,2019-04-09,Public domain in the USA.,en,"[Fantasy fiction, Conduct of life -- Juvenile ...",[Reading ease score: 76.2 (7th grade). Fairly ...,59235
4,Drake's Road Book of the Grand Junction Railwa...,"Drake, James, active 1825",,,2013-07-31,Public domain in the USA.,en,[Grand Junction Railway Company (Great Britain...,[Reading ease score: 65.5 (8th & 9th grade). N...,43367
5,Facts and Speculations on the Origin and Histo...,"Chatto, William Andrew",1799.0,1864.0,2014-05-04,Public domain in the USA.,en,"[Playing cards -- History, GV]",[Reading ease score: 62.7 (8th & 9th grade). N...,45584


In [10]:
print("Number of unique values in the language column: {0}".format(
    df['language'].unique().shape[0]
))

Number of unique values in the language column: 1


In [11]:
if 'language' in df.columns:
    del df['language']

    display(df.head())

Unnamed: 0,title,creator,birthdate,deathdate,issued,rights,subjects,description,Index
1,The Master Spirit,"Magnay, William, Sir",1855.0,1917.0,2022-03-25,Public domain in the USA.,"[London (England) -- Fiction, Love stories, Ad...",[Reading ease score: 71.3 (7th grade). Fairly ...,67703
2,The Family on Wheels,"Oxley, J. Macdonald (James Macdonald)",1855.0,1907.0,2017-11-05,Public domain in the USA.,"[Orphans -- Juvenile fiction, Siblings -- Juve...",[Reading ease score: 77.9 (7th grade). Fairly ...,55891
3,Three Good Giants\nWhose Ancient Deeds are rec...,"Rabelais, François",1835.0,1901.0,2019-04-09,Public domain in the USA.,"[Fantasy fiction, Conduct of life -- Juvenile ...",[Reading ease score: 76.2 (7th grade). Fairly ...,59235
4,Drake's Road Book of the Grand Junction Railwa...,"Drake, James, active 1825",,,2013-07-31,Public domain in the USA.,[Grand Junction Railway Company (Great Britain...,[Reading ease score: 65.5 (8th & 9th grade). N...,43367
5,Facts and Speculations on the Origin and Histo...,"Chatto, William Andrew",1799.0,1864.0,2014-05-04,Public domain in the USA.,"[Playing cards -- History, GV]",[Reading ease score: 62.7 (8th & 9th grade). N...,45584


In [12]:
if 'description' in df.columns:
    reading_ease_scores = list()
    reading_ease_levels = list()

    for i, descriptions in enumerate(df['description']):
        score_found = False
        level_found = False
        
        for description in descriptions:
            if re.search("Reading ease score:", description):
                score_match = re.findall("Reading ease score: ([\d\.]+)", description)
                if score_match:
                    reading_ease_scores.append(float(score_match[0]))
                    score_found = True
                
                level_match = re.findall("Reading ease score:\s[\d\.]+\s\((.+)\)", description)
                if level_match:
                    reading_ease_levels.append(level_match[0])
                    level_found = True
        
        if not score_found:
            reading_ease_scores.append(None)
        if not level_found:
            reading_ease_levels.append(None)

    df['reading ease score'] = reading_ease_scores
    df['reading ease level'] = reading_ease_levels

    del df['description']

df.head()

Unnamed: 0,title,creator,birthdate,deathdate,issued,rights,subjects,Index,reading ease score,reading ease level
1,The Master Spirit,"Magnay, William, Sir",1855.0,1917.0,2022-03-25,Public domain in the USA.,"[London (England) -- Fiction, Love stories, Ad...",67703,71.3,7th grade
2,The Family on Wheels,"Oxley, J. Macdonald (James Macdonald)",1855.0,1907.0,2017-11-05,Public domain in the USA.,"[Orphans -- Juvenile fiction, Siblings -- Juve...",55891,77.9,7th grade
3,Three Good Giants\nWhose Ancient Deeds are rec...,"Rabelais, François",1835.0,1901.0,2019-04-09,Public domain in the USA.,"[Fantasy fiction, Conduct of life -- Juvenile ...",59235,76.2,7th grade
4,Drake's Road Book of the Grand Junction Railwa...,"Drake, James, active 1825",,,2013-07-31,Public domain in the USA.,[Grand Junction Railway Company (Great Britain...,43367,65.5,8th & 9th grade
5,Facts and Speculations on the Origin and Histo...,"Chatto, William Andrew",1799.0,1864.0,2014-05-04,Public domain in the USA.,"[Playing cards -- History, GV]",45584,62.7,8th & 9th grade


In [13]:
print(stats.kurtosis(df['reading ease score'], nan_policy='omit'))

0.1676668769189571


In [14]:
df['subjects'] = df['subjects'].apply(lambda x: [item for sublist in [s.split(' -- ') for s in x] for item in sublist])

display(df[['title', 'subjects']].head())

Unnamed: 0,title,subjects
1,The Master Spirit,"[London (England), Fiction, Love stories, Adve..."
2,The Family on Wheels,"[Orphans, Juvenile fiction, Siblings, Juvenile..."
3,Three Good Giants\nWhose Ancient Deeds are rec...,"[Fantasy fiction, Conduct of life, Juvenile fi..."
4,Drake's Road Book of the Grand Junction Railwa...,[Grand Junction Railway Company (Great Britain...
5,Facts and Speculations on the Origin and Histo...,"[Playing cards, History, GV]"


In [15]:
from collections import Counter

subject_list = df['subjects'].dropna().explode()

subject_counts = Counter(subject_list)
subject_df = pd.DataFrame(subject_counts.items(), columns=['subject', 'count']).sort_values(by='count', ascending=False).reset_index(drop=True)
subject_indexing = {subject: idx for idx, subject in enumerate(subject_df['subject'])}


co_occurrence_matrix = np.zeros((subject_df.shape[0], subject_df.shape[0]))
for subjects in df['subjects'].dropna():
    subjects = [subject for subject in subjects if subject in subject_indexing]  # Filter subjects to those in the set
    for i in range(len(subjects)):
        for j in range(i + 1, len(subjects)):
            idx1 = subject_indexing[subjects[i]]
            idx2 = subject_indexing[subjects[j]]
            co_occurrence_matrix[idx1, idx2] += 1
            co_occurrence_matrix[idx2, idx1] += 1 

print(co_occurrence_matrix)

[[6.8750e+04 2.0400e+02 1.7010e+04 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [2.0400e+02 8.9248e+04 5.3500e+02 ... 0.0000e+00 0.0000e+00 5.0000e+00]
 [1.7010e+04 5.3500e+02 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 ...
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 0.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]
 [0.0000e+00 5.0000e+00 0.0000e+00 ... 0.0000e+00 0.0000e+00 0.0000e+00]]


In [18]:
import plotly.graph_objects as go

G = nx.Graph()


for subjects in df['subjects'].dropna().head(1000):
    for i, subject1 in enumerate(subjects):
        for subject2 in subjects[i+1:]:
            if G.has_edge(subject1, subject2):
                G[subject1][subject2]['weight'] += 1
            else:
                G.add_edge(subject1, subject2, weight=1)

G_filtered = nx.Graph((u, v, e) for u, v, e in G.edges(data=True) if e['weight'] > 5)

pos = nx.spring_layout(G_filtered, k=0.1, seed=42)

edge_x = []
edge_y = []
for edge in G_filtered.edges():
    x0, y0 = pos[edge[0]]
    x1, y1 = pos[edge[1]]
    edge_x.extend([x0, x1, None])
    edge_y.extend([y0, y1, None])

node_x = []
node_y = []
node_size = []
node_text = []
for node in G_filtered.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_size.append(G_filtered.degree[node])
    node_text.append(f"{node}<br>Connections: {G_filtered.degree[node]}")

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=0.5, color='#888'),
    hoverinfo='none',
    mode='lines'
)

node_color = [np.log(degree + 1) for degree in node_size]  # Log scale for color
node_colorbar = [degree for degree in node_size]  # Original degree counts for colorbar ticks

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers',
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',  # Color gradient for log-scaled hue
        size=10,              # Fixed node size (no change)
        color=node_color,     # Logarithmic scale for color
        colorbar=dict(
            thickness=15,
            title="Node Connections",
            tickvals=[np.log(i + 1) for i in range(1, max(node_size) + 1, int(max(node_size)/5))],  # Adjust ticks based on log scale
            ticktext=[str(i) for i in range(1, max(node_size) + 1, int(max(node_size)/5))],  # Display original counts on colorbar ticks
            xanchor='left',
            titleside='right'
        )
    ),
    text=node_text
)
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Interactive Tag Co-occurrence Network',
                    titlefont_size=16,
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=0, l=0, r=0, t=40),
                    xaxis=dict(showgrid=False, zeroline=False),
                    yaxis=dict(showgrid=False, zeroline=False)
                ))

fig.show()