In [1]:
import os
import sys

# Run the notebook from the project directory and make the SidBERT sources importable.
# NOTE(review): both paths are machine-specific absolute paths; consider deriving
# them from an environment variable or a config file so the notebook is portable.
os.chdir('/home/thabib/study_behavior_analysis/')

# Only append once so that re-running this cell does not pile up duplicate entries.
if '/home/thabib/study_behavior_analysis/src/SidBERT' not in sys.path:
    sys.path.append('/home/thabib/study_behavior_analysis/src/SidBERT')

In [2]:
import numpy as np
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from ast import literal_eval

from utils import settings
from preprocessing.booktitle_language_extractor import extract_book_language

# Project root as reported by the project's settings helper; used further down
# to build the path of the saved book/DDC data.
project_root = settings.get_project_root()

In [39]:
# Raise the row limit so long results are printed in full instead of being
# truncated. Other limits (max_columns, max_colwidth, width, precision) can be
# tuned the same way; see https://stackoverflow.com/a/52432757/
pd.set_option('display.max_rows', 3000)

In [4]:
# Root classes to process, as one-character strings '0' .. '9'
# (plain Python str, no NumPy round trip needed).
classes_to_extract = [str(digit) for digit in range(10)]

# Flags forwarded to extract_book_language(); the first, second and third are
# also encoded in the name of the saved CSV, so they select which file is loaded.
original_only = True
include_language = True
include_lang_probs = True
include_root_class = True

In [29]:
# True: run language detection now. False: load the previously saved result
# for the same configuration from disk.
do_language_extraction = False

if do_language_extraction:
    # Perform language detection and let the extractor persist its result.
    save_data_to_disk = True

    dataset_lang = extract_book_language(root_classes=classes_to_extract,
                                         original_only=original_only,
                                         include_root_class=include_root_class,
                                         include_language=include_language,
                                         include_lang_probs=include_lang_probs,
                                         random_seed=1,
                                         save_to_disk=save_data_to_disk)

else:
    # Load the pre-saved, language-annotated dataset from disk.
    filename = 'Classes{}_OriginalOnly{}_Lang{}_Probs{}'.format(''.join(classes_to_extract),
                                                                original_only,
                                                                include_language,
                                                                include_lang_probs)

    # data_path already starts at project_root, so it must not be prefixed again.
    data_path = os.path.join(project_root, 'src', 'data', 'SidBERT_data', 'book_ddc_data')

    # 'language' and 'language_probs' are stored as stringified Python lists and
    # are parsed back with literal_eval. The probabilities column is only
    # requested when the configuration includes it.
    columns_to_load = ['index', 'Title', 'DDC', 'language', 'root_class']
    list_converters = {'language': literal_eval}
    if include_lang_probs:
        columns_to_load.append('language_probs')
        list_converters['language_probs'] = literal_eval

    dataset_lang = pd.read_csv(os.path.join(data_path, filename + '.csv'),
                               usecols=columns_to_load,
                               dtype={'index': np.int64, 'Title': str, 'DDC': str, 'root_class': str},
                               converters=list_converters)

In [30]:
# Preview the language-annotated dataset produced by the previous cell.
dataset_lang

Unnamed: 0,index,Title,DDC,language,language_probs,root_class
0,239,"Self-producing systems, implications and appli...",0037,[English],[0.9999965268438771],0
1,424,National information policies and strategies: ...,025,[English],[0.9999953898858631],0
2,771,"Extensions of the UNITY methodology, compositi...",0052,[English],[0.9999980245573832],0
3,772,"4th European workshop, EWSPT '95, Noordwijkerh...",0051,[English],[0.9999953020047926],0
4,773,"Theoretical informatics, proceedings",004,[English],[0.999997360951342],0
...,...,...,...,...,...,...
542062,603177,Römische Kultur im Bilde,937,[German],[0.9999959464326249],9
542063,603640,Fiefs and vassals : the medieval evidence rein...,9401,"[English, Dutch]","[0.5714276691406047, 0.4285709777966101]",9
542064,604094,Die Frauenfrage : ihre geschichtliche Entwickl...,940,[German],[0.9999964485105741],9
542065,604245,Psychologie der Geschichte,901,[German],[0.9999965244453901],9


In [31]:
def language_extractor(language_str='english', exclusive=False, confident=True, dataset=None):
    """Return the rows whose detected languages match ``language_str``.

    Every entry of the ``'language'`` column must be a list of language names,
    e.g. ``['German', 'English']``.

    Parameters
    ----------
    language_str : str
        Language to look for. It is compared after ``str.capitalize()``, so
        ``'german'`` matches ``'German'``.
    exclusive : bool
        If True, keep only rows whose language list consists of exactly this
        one language. ``confident`` is ignored in that case.
    confident : bool
        Only used when ``exclusive`` is False. If True, keep rows whose *first*
        listed language is the requested one (presumably the most probable
        one - TODO confirm the ordering produced by the extractor). If False,
        keep rows that list the language at any position.
    dataset : pandas.DataFrame, optional
        Frame to filter. Defaults to the notebook-level ``dataset_lang``.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order, with a fresh 0..n-1 index.
    """
    if dataset is None:
        dataset = dataset_lang

    target = language_str.capitalize()

    if exclusive:
        def is_match(langs):
            return len(langs) == 1 and langs[0] == target
    elif confident:
        def is_match(langs):
            # Rows with an empty language list can never match.
            return len(langs) > 0 and langs[0] == target
    else:
        def is_match(langs):
            return target in langs

    # Build the mask by iterating over the values, so the result does not depend
    # on the frame having a default 0..n-1 index.
    mask = np.array([is_match(langs) for langs in dataset['language']], dtype=bool)

    return dataset[mask].reset_index(drop=True)


def extract_particular_(languages_list=None, dataset=None):
    """Return the rows whose language list equals ``languages_list`` exactly.

    The comparison is plain list equality, so both the languages and their
    order must match: ``['German', 'English']`` does not select rows labelled
    ``['English', 'German']``.

    Parameters
    ----------
    languages_list : list of str, optional
        Exact language list to select. With the default ``None`` no row
        matches and an empty frame is returned.
    dataset : pandas.DataFrame, optional
        Frame to filter. Defaults to the notebook-level ``dataset_lang``.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order, with a fresh 0..n-1 index.
    """
    if dataset is None:
        dataset = dataset_lang

    # Iterate over the values so the result does not depend on the frame
    # having a default 0..n-1 index.
    mask = np.array([langs == languages_list for langs in dataset['language']], dtype=bool)

    return dataset[mask].reset_index(drop=True)
    

In [40]:
# Count how often each distinct language list occurs, most frequent first.
dataset_lang['language'].value_counts()

[German]                                                         331575
[English]                                                        164724
[French]                                                           7659
[Italian]                                                          4578
[German, English]                                                  3813
[English, German]                                                  2264
[Spanish]                                                          2124
[English, Italian]                                                 1972
[English, French]                                                  1342
[German, Swedish]                                                   758
[English, Catalan]                                                  741
[German, Danish]                                                    684
[German, Afrikaans]                                                 664
[English, Romanian]                                             

In [22]:
# Titles detected as German and nothing else.
# NOTE(review): the saved output of this cell lists columns ('Unnamed: 0.1',
# 'Description') that the current load cell does not request and lacks
# 'language_probs' - it looks stale; re-run the notebook top to bottom.
dt_extract = language_extractor(language_str='German', exclusive=True)
dt_extract

Unnamed: 0.1,Unnamed: 0,index,Title,DDC,Description,language,root_class
0,28,3791,"Künstliche Intelligenz, philosophische Probleme",63,,[German],0
1,30,4097,"Die Wundermaschine, die unendliche Geschichte ...",4,Ein österreichischer Wirtschaftshistoriker hat...,[German],0
2,31,4148,"Basiskenntnis Bibliothek, eine Fachkunde für F...",20,Aus dem Vorwort: Die vorherigen Auflagen basie...,[German],0
3,32,4200,"Königsberger Buch- und Bibliotheksgeschichte, ...",20,,[German],0
4,34,4967,"Die Bibliotheca Albertina in Leipzig, Festschr...",20,,[German],0
...,...,...,...,...,...,...,...
331570,542061,603176,Die Befreiung des Prometheus : ein Fund aus Pe...,9301,,[German],9
331571,542062,603177,Römische Kultur im Bilde,937,,[German],9
331572,542064,604094,Die Frauenfrage : ihre geschichtliche Entwickl...,940,,[German],9
331573,542065,604245,Psychologie der Geschichte,901,,[German],9


In [41]:
# Titles whose language list is exactly ['German', 'English'], in that order.
# Note that this overwrites the dt_extract produced by the previous cell.
dt_extract = extract_particular_(languages_list=['German', 'English'])
dt_extract

Unnamed: 0,index,Title,DDC,language,language_probs,root_class
0,31336,"Aus- und Weiterbildung zum ""Certified Professi...",004,"[German, English]","[0.5714282923942672, 0.4285715329509253]",0
1,39717,"SQL thinking, vom Problem zum SQL-Statement",004,"[German, English]","[0.8571387711865484, 0.14285881374457793]",0
2,39769,Semantische Integration von Data Warehousing u...,004,"[German, English]","[0.8571396692653102, 0.14285815705708108]",0
3,40622,"Copia librorum, Problemgeschichte imaginierter...",020,"[German, English]","[0.8571415166929515, 0.1428572087925721]",0
4,41234,"4. e-Learning Fachtagung Informatik, 11. - 14....",004,"[German, English]","[0.8571415296306534, 0.14285761022287174]",0
...,...,...,...,...,...,...
3808,309424,Antiochos IV. Epiphanes : eine politische Biog...,909,"[German, English]","[0.5714273692873775, 0.4285704876112323]",9
3809,313745,Visual history : ein Studienbuch,900,"[German, English]","[0.857139516539694, 0.14285758168309257]",9
3810,475460,Max Weber und Vilfredo Pareto : Dialog und Kon...,9363,"[German, English]","[0.5714282233071107, 0.4285710535463766]",9
3811,500463,Jonas erstes Buch vom Leben Columbans,9401,"[German, English]","[0.8571382613354961, 0.1428578902889556]",9


In [43]:
# Show one full title; the extract has a fresh 0..n-1 index, so label 2 is its third row.
dt_extract['Title'][2]

'Semantische Integration von Data Warehousing und Wissensmanagement'