# 1. Imports

In [15]:
import gc
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from string import punctuation

import plotly.express as px

import atel
from atel.data import BookCollection

# 2. Load data - Child Written Books

In [16]:
# load data
book_col = BookCollection(data_file='../data/book_col_271120.pkl')

# Total number of books
print(f'Number of books: {book_col.num_books}')

Loaded from disk: ../data/book_col_271120.pkl
Number of books: 803


## 2.1 Get all attributes for each book

In [17]:
data = []

for i, book in enumerate(book_col):
    if book.code_dict is None:
        print(f'Book with index {i} is None')
        continue
    book.code_dict['book_id'] = i 
    data.append(book.code_dict)

Book with index 509 is None


In [18]:
book_df = pd.DataFrame(data)

In [19]:
list_cols = [
    'Genre', 'Attitude', 'Stavning', 'Perspektiv', 'Tekstbånd', 'Fremstillingsform', 
    'Semantisk univers', 'Stemmer', 'Forbindere', 'Interjektioner'
]

In [20]:
## Replace empty strings with NaN values
book_df = book_df.replace('', np.NaN)

## Explode all columns
# Exploded books
ex_book_df = book_df.copy(deep=True)
for col in list_cols:
    ex_book_df = ex_book_df.explode(col)

# del book_df
# gc.collect()

In [21]:
ex_book_df.head()

Unnamed: 0,Genre,Attitude,Stavning,Perspektiv,Tekstbånd,Fremstillingsform,Semantisk univers,Holistisk vurdering,Stemmer,book_id,Forbindere,Interjektioner,Metaforbrug,Noter
0,Selvportræt,Bedømmelse positiv,Semifonetisk,Eksplicit fortæller,,Beskrivende,Mennesker,1,En stemme,0,,,,
1,Rim og remser,,Fonetisk,Implicit fortæller,Semantisk,Beskrivende,Vilde dyr,1,En stemme,1,,,,
2,,Følelse 1. person,Semifonetisk,Eksplicit fortæller,,Beskrivende,Mig selv,0,En stemme,2,Tilføjelse,,,
3,Hverdagsbeskrivelse,,Semifonetisk,Implicit fortæller,,Beskrivende,Kæledyr,1,En stemme,3,,,,
3,Hverdagsbeskrivelse,,Fonetisk,Implicit fortæller,,Beskrivende,Kæledyr,1,En stemme,3,,,,


In [26]:
# Replace "Vilde dyr " with "Vilde dyr" (space difference)
ex_book_df[ex_book_df['Semantisk univers'] == 'Vilde dyr\xa0']['Semantisk univers'] = 'Vilde dyr'



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## 2.2 Plot distributions of classification columns

In [27]:
def plot_distribution(category: str):
    px.histogram(ex_book_df[['book_id', category]].drop_duplicates(), x=category)\
      .update_xaxes(categoryorder='total descending').show()

In [28]:
plot_distribution('Genre')

## 2.3 Get all senteces of each book

In [48]:
def print_book_sentences(book_id: int):
    print(book_col[book_id].get_fulltext())

In [37]:
def clean_book_text(book: atel.data.Book):
    s = book.get_fulltext() 
    s = s.replace('\t', ' ').replace('\n', ' ')
    s = re.sub('[^[a-zA-Z0-9æøåÆØÅ\s]', ' ', s)
    s = re.sub('\s+', ' ', s)  # removes trailing whitespaces
    s = s.lower().strip()
    
    return s

In [52]:
def clean_book_collection_texts(book_col: atel.data.BookCollection, include_empty_texts: bool=False):
    book_ids = []
    texts    = []
    
    for i, book in enumerate(book_col):
        s = clean_book_text(book)
        
        if not include_empty_texts and s != '':
            texts.append(s)
            book_ids.append(i)
    
    return book_ids, texts