# Trainingspipeline

In [1]:
import codecs
import nltk
import numpy as np
import os
import pandas as pd
import re
import scipy
import spacy

from gensim.models.phrases import Phraser, Phrases
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from joblib import Parallel, delayed  
from nltk.corpus import stopwords
from tabulate import tabulate

In [2]:
stopwords = stopwords.words('italian')
tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')

In [3]:
df = pd.read_csv('../Korpus/Korpus/corpus_final.csv', sep=',', encoding='utf-8')

In [4]:
df.rename({'lemmatized text': 'lemmatized_text', 'cleaned tokenized text': 'cleaned_tokenized_text'}, axis=1, inplace=True)

In [5]:
df.head()

Unnamed: 0,doc,source,author,title,year,period,text type,text,words,lemmatized_text,cleaned_tokenized_text
0,Espositivi.IV.4.Testo.txt,MIDIA,Ludovico Antonio Muratori,Antichità italiane,1700.0,1700-1750,espositivo,"﻿IV. 4. Ludovico Antonio Muratori, Antichità i...",8990.0,"﻿iv . 4 . Ludovico Antonio muratori , antichit...","[['iv'], [], ['ludovico', 'antonio', 'muratori..."
1,Poesia.IV.1.Testo.txt,MIDIA,Giuseppe Paolucci (Alessi Cillenio),Poesie,1700.0,1700-1750,poesia,IV. 1. Rime degli Arcadi: Alessi Cillenio (Giu...,10862.0,iv . 1 . rima del arcadi : alessi cillenio ( G...,"[['iv'], [], ['rima', 'arcadi', 'alessi', 'cil..."
2,Personali.IV.4.Testo.txt,MIDIA,Vincenzo da Filicaia,Lettere inedite a Lorenzo Magalotti,1700.0,1700-1750,personale,"IV. 4. Vincenzo da Filicaia, Lettere inedite a...",10073.0,"iv . 4 . Vincenzo da filicaia , lettere inedit...","[['iv'], [], ['vincenzo', 'filicaia', 'lettere..."
3,Personali.IV.5.Testo.txt,MIDIA,Lorenzo Magalotti,Lettere odorose (1693-1705),1700.0,1700-1750,personale,"IV. 5. Lorenzo Magalotti, Lettere odorose (169...",8374.0,"iv . 5 . Lorenzo magalotti , lettere odoroso (...","[['iv'], [], ['lorenzo', 'magalotti', 'lettere..."
4,Poesia.IV.4.Testo.txt,MIDIA,Faustina Maratti Zappi,Poesie,1700.0,1700-1750,poesia,IV. 4. Rime degli Arcadi: Aglauro Cidonia (Fau...,3184.0,iv . 4 . rima del arcadi : aglauro cidonia ( f...,"[['iv'], [], ['rima', 'arcadi', 'aglauro', 'ci..."


In [6]:
df.shape

(697296, 11)

In [7]:
df.text = df.text.fillna('')
df.lemmatized_text = df.lemmatized_text.fillna('')

In [8]:
# Einzeldataframes für die Zeiträume

df_periods = dict(tuple(df.groupby(by='period')))

In [9]:
df1 = df_periods['1700-1750']
df2 = df_periods['1751-1800']
df3 = df_periods['1801-1825']
df4 = df_periods['1826-1850']
df5 = df_periods['1851-1875']
df6 = df_periods['1876-1900']
df7 = df_periods['1901-1925']
df8 = df_periods['1926-1950']
df9 = df_periods['1951-1975']
df10 = df_periods['1976-2000']
df11 = df_periods['2001-2010']
df12 = df_periods['2011-2016']
df13 = df_periods['2017-2021']

In [10]:
df9.head()

Unnamed: 0,doc,source,author,title,year,period,text type,text,words,lemmatized_text,cleaned_tokenized_text
16797,GBspavento.csv,Gutenberg,Vittorio Imbriani,La novellaja fiorentina: Fiabe e novelline ste...,1876.0,1876-1900,prosa letteraria,", E dissegli:–""Madonna, io son contento ""D''es...",43.0,", e dissegli:–""madonna , io essere contento "" ...","[['dissegli', 'madonna', 'essere', 'contento',..."
16798,GBpaura_randomsample.csv,Gutenberg,Vittorio Imbriani,Mastr''Impicca,1876.0,1876-1900,prosa letteraria,"... Io.... Lei.... Come qui?""--rispose la Rosm...",37.0,"... io .... lei .... come qui?""--rispose il ro...","[['qui'], ['rispose', 'rosmunda', 'ancora', 'r..."
16799,GBpaura_randomsample.csv,Gutenberg,Vittorio Imbriani,Mastr''Impicca,1876.0,1876-1900,prosa letteraria,", perchè alcuni razzi del fuoco d''artificio p...",40.0,", perchè alcun razzo del fuoco d''artificio pr...","[['perchè', 'alcun', 'razzo', 'fuoco', 'd', 'a..."
16800,GBpaura_randomsample.csv,Gutenberg,Vittorio Imbriani,Mastr''Impicca,1876.0,1876-1900,prosa letteraria,"di tutti gli Scaricabarilesi, che non c''è dim...",40.0,"di tutto il scaricabarilesi , che non c''è dim...","[['scaricabarilesi', 'dimostrazion', 'd', 'oss..."
16801,GBpaura_randomsample.csv,Gutenberg,Vittorio Imbriani,Mastr''Impicca,1876.0,1876-1900,prosa letteraria,"ne dice la Maestà del despota d''Exibo?"". Don ...",40.0,"ne dire il maestà del despota d''exibo ? "" . D...","[['dire', 'maestà', 'despota', 'd', 'exibo'], ..."


## Training von Word2Vec

In [11]:
# Hilfsfunktionen zur Vorbereitung auf das Training
# Bereinigung und Tokenisierung

def sentence_to_wordlist(raw:str):
    """
    cleans and tokenizes the sentences
    """
    text = re.sub('[^A-Za-z_àÀèÈìÌòÒùÙáÁéÉíÍóÓúÚ]',' ', raw).split()        # Diakritika ans Italienische anpassen                    
    filtered_text = [word for word in text if word not in stopwords]        # Stopwörter löschen
    return filtered_text


def tokenize_text(raw_text):
    """
    returns a list of lowercase tokenized sentences 
    """
    raw_sentences = tokenizer.tokenize(str(raw_text).lower())    
    tokenized_sentences = Parallel(n_jobs=-1)(delayed(sentence_to_wordlist)(raw_sentence) for raw_sentence in raw_sentences)
    phrases = Phrases(tokenized_sentences)
    bigram = Phraser(phrases)
    sentences = list(bigram[tokenized_sentences])
    return sentences

In [12]:
# Trainingsparamter setzen

vector_size = 300                  # Dimensionality of the word vectors
window = 10                        # The maximum distance between the current and predicted word within a sentence
min_count = 2                      # (int, optional) – The model ignores all words with total frequency lower than this
workers = 4                        # Use these many worker threads to train the model (=faster training with multicore machines)
min_alpha = 0.0001                 # Learning rate will linearly drop to min_alpha as training progresses
sg = 1                             # Training algorithm: skip-gram if sg=1, otherwise CBOW            
seed = 42                          # Reproductivity (42 just because...)

In [13]:
# Ordner anlegen zum Abspeichern von trainierten Modellen

if not os.path.exists('../trained_models'):
    os.makedirs('../trained_models')

### Zeitraum 1: 1700-1750

In [14]:
# lemmatisierte Texte zu einem String verbinden

text1 = ''

for i in df1.lemmatized_text:
    text1 += i

In [17]:
%%time
sentences1 = tokenize_text(text1)         # Bereinigen, Tokenisieren und in Form bringen (Ziel: Liste von tokenisierten Sätzen)

Wall time: 20.3 s


In [18]:
print(sentences1[:5])

[['iv'], [], ['ludovico', 'antonio', 'muratori', 'antichità', 'italiano', 'dissertazione', 'gente', 'barbaro', 'assoggettare', 'italia'], ['oggetto', 'ammirazione', 'essere', 'antico', 'tempo', 'roma', 'roma', 'stendere', 'imperio', 'già', 'sopra', 'terra', 'alcun', 'scrittore', 'adulatoriamente', 'scrivere', 'volta', 'sì', 'bene', 'sopra', 'gran_parte', 'tre_parto', 'allora', 'conoscere', 'terra'], ['tanto', 'potenza', 'niuna', 'essere', 'mai', 'giungere', 'precedente', 'monarchia']]


In [19]:
len(sentences1)

82676

In [20]:
%%time

# Training   

w2v1 = Word2Vec(sentences=sentences1,                      
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 53.7 s


In [21]:
w2v1.wv.most_similar(positive=['terrore'], topn=10)

[('spavento', 0.8295801281929016),
 ('tal_terrore', 0.7880813479423523),
 ('gran_terrore', 0.7854892611503601),
 ('barbari', 0.7742155194282532),
 ('goti', 0.7736718654632568),
 ('ribellare', 0.7582488656044006),
 ('de_goti', 0.7563993334770203),
 ('spaventare', 0.7506831884384155),
 ('faceano', 0.7486810088157654),
 ('brescia', 0.745989203453064)]

In [None]:
# trainiertes Modell speichern

w2v1.save(os.path.join('../trained_models', 'w2v1.model'))

### Zeitraum 2: 1751-1800

In [None]:
text2 = ''

for i in df2.lemmatized_text:
    text2 += i

In [None]:
%%time
sentences2 = tokenize_text(text2)

In [None]:
print(sentences2[:5])

In [None]:
len(sentences2)

In [None]:
%%time
w2v2 = Word2Vec(sentences=sentences2,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v2.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v2.save(os.path.join('../trained_models', 'w2v2.model'))

### Zeitraum 3: 1801-1825

In [None]:
text3 = ''

for i in df3.lemmatized_text:
    text3 += i

In [None]:
%%time
sentences3 = tokenize_text(text3)

In [None]:
print(sentences3[:5])

In [None]:
len(sentences3)

In [None]:
%%time
w2v3 = Word2Vec(sentences=sentences3,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v3.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v3.save(os.path.join('../trained_models', 'w2v3.model'))

### Zeitraum 4: 1826-1850

In [None]:
text4 = ''

for i in df4.lemmatized_text:
    text4 += i

In [None]:
%%time
sentences4 = tokenize_text(text4)

In [None]:
print(sentences4[:5])

In [None]:
len(sentences4)

In [None]:
%%time
w2v4 = Word2Vec(sentences=sentences4,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v4.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v4.save(os.path.join('../trained_models', 'w2v4.model'))

### Zeitraum 5: 1851-1875

In [None]:
text5 = ''

for i in df5.lemmatized_text:
    text5 += i

In [None]:
%%time
sentences5 = tokenize_text(text5)

In [None]:
print(sentences5[:5])

In [None]:
len(sentences5)

In [None]:
%%time
w2v5 = Word2Vec(sentences=sentences5,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v5.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v5.save(os.path.join('../trained_models', 'w2v5.model'))

### Zeitraum 6: 1876-1900

In [None]:
text6 = ''

for i in df6.lemmatized_text:
    text6 += i

In [None]:
%%time
sentences6 = tokenize_text(text6)

In [None]:
print(sentences6[:5])

In [None]:
len(sentences6)

In [None]:
%%time
w2v6 = Word2Vec(sentences=sentences6,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v6.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v6.save(os.path.join('../trained_models', 'w2v6.model'))

### Zeitraum 7: 1901-1925

In [None]:
text7 = ''

for i in df7.lemmatized_text:
    text7 += i

In [None]:
%%time
sentences7 = tokenize_text(text7)

In [None]:
print(sentences7[:5])

In [None]:
len(sentences7)

In [None]:
%%time
w2v7 = Word2Vec(sentences=sentences7,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v7.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v7.save(os.path.join('../trained_models', 'w2v7.model'))

### Zeitraum 8: 1926-1950

In [None]:
text8 = ''

for i in df8.lemmatized_text:
    text8 += i

In [None]:
%%time
sentences8 = tokenize_text(text8)

In [None]:
print(sentences8[:5])

In [None]:
len(sentences8)

In [None]:
%%time
w2v8 = Word2Vec(sentences=sentences8,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v8.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v8.save(os.path.join('../trained_models', 'w2v8.model'))

### Zeitraum 9: 1951-1975

In [None]:
text9 = ''

for i in df9.lemmatized_text:
    text9 += i

In [None]:
%%time
sentences9 = tokenize_text(text9)

In [None]:
print(sentences9[:5])

In [None]:
len(sentences9)

In [None]:
%%time
w2v9 = Word2Vec(sentences=sentences9,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v9.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v9.save(os.path.join('../trained_models', 'w2v9.model'))

### Zeitraum 10: 1976-2000

In [None]:
text10 = ''

for i in df10.lemmatized_text:
    text10+= i

In [None]:
%%time
sentences10 = tokenize_text(text10)

In [None]:
print(sentences10[:5])

In [None]:
len(sentences10)

In [None]:
%%time
w2v10 = Word2Vec(sentences=sentences10,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v10.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v10.save(os.path.join('../trained_models', 'w2v10.model'))

### Zeitraum 11: 2001-2010

In [None]:
text11 = ''

for i in df11.lemmatized_text:
    text11+= i

In [None]:
%%time
sentences11 = tokenize_text(text11)

In [None]:
print(sentences11[:5])

In [None]:
len(sentences11)

In [None]:
%%time
w2v11 = Word2Vec(sentences=sentences11,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v11.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v11.save(os.path.join('../trained_models', 'w2v11.model'))

### Zeitraum 12: 2011-2016

In [None]:
text12 = ''

for i in df12.lemmatized_text:
    text12+= i

In [None]:
%%time
sentences12 = tokenize_text(text12)

In [None]:
print(sentences12[:5])

In [None]:
len(sentences12)

In [None]:
%%time
w2v12 = Word2Vec(sentences=sentences12,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v12.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v12.save(os.path.join('../trained_models', 'w2v12.model'))

### Zeitraum 13: 2017-2021

In [None]:
text13 = ''

for i in df13.lemmatized_text:
    text13+= i

In [None]:
%%time
sentences13 = tokenize_text(text13)

In [None]:
print(sentences13[:5])

In [None]:
len(sentences13)

In [None]:
%%time
w2v13 = Word2Vec(sentences=sentences13,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

In [None]:
w2v13.wv.most_similar(positive=['terrore'], topn=10)

In [None]:
w2v13.save(os.path.join('../trained_models', 'w2v13.model'))