# Trainingspipeline 40

- balanciertes Korpus
- ohne Bigramme
- vector_size: 300
- window: 15
- seed: 1

In [1]:
import codecs
import nltk
import numpy as np
import os
import pandas as pd
import re
import scipy
import spacy

from gensim.models import Word2Vec
from joblib import Parallel, delayed  
from nltk.corpus import stopwords

In [2]:
stopwords = stopwords.words('italian')
tokenizer = nltk.data.load('tokenizers/punkt/italian.pickle')

In [3]:
df = pd.read_csv('../Korpus/Korpus/corpus_balanced.csv', sep=',', encoding='utf-8')

In [4]:
df.head()

Unnamed: 0,doc,source,author,title,year,period,text_type,text,words,lemmatized_text,cleaned_tokenized_text
0,Poesia.IV.4.Testo.txt,MIDIA,Faustina Maratti Zappi,Poesie,1700.0,1700-1750,poesia,IV. 4. Rime degli Arcadi: Aglauro Cidonia (Fau...,3184.0,iv . 4 . rima del arcadi : aglauro cidonia ( f...,"[['iv'], [], ['rima', 'arcadi', 'aglauro', 'ci..."
1,Espositivi.IV.4.Testo.txt,MIDIA,Ludovico Antonio Muratori,Antichità italiane,1700.0,1700-1750,espositivo,"﻿IV. 4. Ludovico Antonio Muratori, Antichità i...",8990.0,"﻿iv . 4 . Ludovico Antonio muratori , antichit...","[['iv'], [], ['ludovico', 'antonio', 'muratori..."
2,Personali.IV.5.Testo.txt,MIDIA,Lorenzo Magalotti,Lettere odorose (1693-1705),1700.0,1700-1750,personale,"IV. 5. Lorenzo Magalotti, Lettere odorose (169...",8374.0,"iv . 5 . Lorenzo magalotti , lettere odoroso (...","[['iv'], [], ['lorenzo', 'magalotti', 'lettere..."
3,Personali.IV.15.Testo.txt,MIDIA,Pietro Giannone,Vita scritta da lui medesimo,1700.0,1700-1750,personale,[Proemio]\nPrendo a scrivere la mia vita e qua...,10118.0,[ proemio ] \n prendere a scrivere il mio vita...,"[['proemio', 'prendere', 'scrivere', 'vita', '..."
4,Personali.IV.4.Testo.txt,MIDIA,Vincenzo da Filicaia,Lettere inedite a Lorenzo Magalotti,1700.0,1700-1750,personale,"IV. 4. Vincenzo da Filicaia, Lettere inedite a...",10073.0,"iv . 4 . Vincenzo da filicaia , lettere inedit...","[['iv'], [], ['vincenzo', 'filicaia', 'lettere..."


In [5]:
df.shape

(743763, 11)

In [6]:
df.text = df.text.fillna('')
df.lemmatized_text = df.lemmatized_text.fillna('')

In [7]:
# Einzeldataframes für die Zeiträume

df_periods = dict(tuple(df.groupby(by='period')))

In [8]:
df1 = df_periods['1700-1750']
df2 = df_periods['1751-1800']
df3 = df_periods['1801-1825']
df4 = df_periods['1826-1850']
df5 = df_periods['1851-1875']
df6 = df_periods['1876-1900']
df7 = df_periods['1901-1925']
df8 = df_periods['1926-1950']
df9 = df_periods['1951-1975']
df10 = df_periods['1976-2000']
df11 = df_periods['2001-2010']
df12 = df_periods['2011-2016']
df13 = df_periods['2017-2021']

In [9]:
df9.head()

Unnamed: 0,doc,source,author,title,year,period,text_type,text,words,lemmatized_text,cleaned_tokenized_text
126313,LISRodari3.txt,LIS,Gianno Rodari,La questione dei fumetti,1951.0,1951-1975,stampa,"Caro Direttore , ho letto nell ' ultimo numero...",1510.0,"caro direttore , avere leggere nell ' ultimo n...","[['caro', 'direttore', 'avere', 'leggere', 'ul..."
126314,LISJotti1.txt,LIS,Nilde Jotti,La questione dei fumetti,1951.0,1951-1975,stampa,Il dibattito sulla stampa a fumetti per i raga...,2785.0,il dibattito sulla stampa a fumetto per il rag...,"[['dibattito', 'stampa', 'fumetto', 'ragazzo',..."
126315,LLAlbertelli1.txt,Liber Liber,Pilo Albertelli,Rousseau,1951.0,1951-1975,prosa letteraria,﻿Pilo Albertelli\nRousseau\n\n Nacque il 28 g...,4894.0,﻿pilo albertelli \n rousseau \n\n nascere il...,"[['pilo', 'albertelli', 'rousseau', 'nascere',..."
126316,LISManacorda1.txt,LIS,Gastone Manacorda,Il Partito e la sua funzione di guida nel camp...,1951.0,1951-1975,stampa,"Al partito , nel suo rigoglioso sviluppo , seg...",3460.0,"al partito , nel suo rigoglioso sviluppo , seg...","[['partito', 'rigoglioso', 'sviluppo', 'seguit..."
126317,LISBianchi1.txt,LIS,Ranuccio Bianchi Bandinelli,Il nostro lavoro nella scuola,1951.0,1951-1975,stampa,"Come in tutti i congressi , anche nel VII Cong...",2898.0,"come in tutto il congresso , anche nel vii con...","[['congresso', 'vii', 'congresso', 'p'], ['tes..."


## Training von Word2Vec

In [10]:
# Hilfsfunktionen zur Vorbereitung auf das Training
# Bereinigung und Tokenisierung

def sentence_to_wordlist(raw:str):
    """
    cleans and tokenizes the sentences
    """
    text = re.sub('[^A-Za-z_àÀèÈìÌòÒùÙáÁéÉíÍóÓúÚ]',' ', raw).split()        # Diakritika ans Italienische anpassen                    
    filtered_text = [word for word in text if word not in stopwords]        # Stopwörter löschen
    return filtered_text


def tokenize_text(raw_text):
    """
    returns a list of lowercase tokenized sentences 
    """
    raw_sentences = tokenizer.tokenize(str(raw_text).lower())    
    tokenized_sentences = Parallel(n_jobs=-1)(delayed(sentence_to_wordlist)(raw_sentence) for raw_sentence in raw_sentences)
    sentences = tokenized_sentences
    return sentences

In [11]:
# Trainingsparamter setzen

vector_size = 300                  # Dimensionality of the word vectors
window = 15                        # The maximum distance between the current and predicted word within a sentence
min_count = 2                      # (int, optional) – The model ignores all words with total frequency lower than this
workers = 1                        # Use these many worker threads to train the model (faster training with multicore machines)
min_alpha = 0.0001                 # Learning rate will linearly drop to min_alpha as training progresses
sg = 1                             # Training algorithm: skip-gram if sg=1, otherwise CBOW            
seed = 1                           # Reproductivity --> only if workers = 1

In [12]:
# Ordner anlegen zum Abspeichern von trainierten Modellen

if not os.path.exists('../trained_models'):
    os.makedirs('../trained_models')

### Zeitraum 1: 1700-1750

In [13]:
# lemmatisierte Texte zu einem String verbinden

text1 = ''

for i in df1.lemmatized_text:
    text1 += i

In [14]:
%%time
sentences1 = tokenize_text(text1)         # Bereinigen, Tokenisieren und in Form bringen (Ziel: Liste von tokenisierten Sätzen)

Wall time: 21.9 s


In [15]:
%%time

# Training   

w2v1 = Word2Vec(sentences=sentences1,                      
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 5min 33s


In [16]:
w2v1.wv.most_similar(positive=['terrore'], topn=10)

[('turchesche', 0.5449481010437012),
 ('lodigiano', 0.53666090965271),
 ('avanzò', 0.5358530879020691),
 ('bavaresi', 0.5295931100845337),
 ('riempiè', 0.527573823928833),
 ('commozione', 0.5225040316581726),
 ('carmagnola', 0.5209928154945374),
 ('spavento', 0.5143598914146423),
 ('impadronitosi', 0.5131425261497498),
 ('costernazione', 0.5126888751983643)]

In [17]:
# trainiertes Modell speichern

w2v1.save(os.path.join('../trained_models/Word2Vec40', '40w2v1.model'))

### Zeitraum 2: 1751-1800

In [18]:
text2 = ''

for i in df2.lemmatized_text:
    text2 += i

In [19]:
%%time
sentences2 = tokenize_text(text2)

Wall time: 30.7 s


In [20]:
%%time
w2v2 = Word2Vec(sentences=sentences2,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 6min 2s


In [21]:
w2v2.wv.most_similar(positive=['terrore'], topn=10)

[('ostile', 0.6011592149734497),
 ('timur', 0.596372663974762),
 ('inspirare', 0.5947270393371582),
 ('legione', 0.5836865305900574),
 ('agghiadando', 0.5806405544281006),
 ('crociati', 0.5787096619606018),
 ('devastazione', 0.5780643224716187),
 ('formidabil', 0.5735940933227539),
 ('invisibili', 0.5637351870536804),
 ('secondarono', 0.5532472133636475)]

In [22]:
w2v2.save(os.path.join('../trained_models/Word2Vec40', '40w2v2.model'))

### Zeitraum 3: 1801-1825

In [23]:
text3 = ''

for i in df3.lemmatized_text:
    text3 += i

In [24]:
%%time
sentences3 = tokenize_text(text3)

Wall time: 26.2 s


In [25]:
%%time
w2v3 = Word2Vec(sentences=sentences3,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 5min 45s


In [26]:
w2v3.wv.most_similar(positive=['terrore'], topn=10)

[('repentino', 0.6435782313346863),
 ('fremè', 0.6033051609992981),
 ('impadronisce', 0.5937840938568115),
 ('aspettò', 0.5876455307006836),
 ('imaginari', 0.5873774290084839),
 ('sovrannaturale', 0.5851864814758301),
 ('innalza', 0.5797820091247559),
 ('sorpresa', 0.5764471888542175),
 ('ritrarnelo', 0.5564179420471191),
 ('rammarico', 0.5516132712364197)]

In [27]:
w2v3.save(os.path.join('../trained_models/Word2Vec40', '40w2v3.model'))

### Zeitraum 4: 1826-1850

In [28]:
text4 = ''

for i in df4.lemmatized_text:
    text4 += i

In [29]:
%%time
sentences4 = tokenize_text(text4)

Wall time: 27.1 s


In [30]:
%%time
w2v4 = Word2Vec(sentences=sentences4,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 5min 39s


In [31]:
w2v4.wv.most_similar(positive=['terrore'], topn=10)

[('panico', 0.5334613919258118),
 ('bentosto', 0.5279020667076111),
 ('inspirare', 0.5195204615592957),
 ('augereau', 0.49806907773017883),
 ('aristocrazìa', 0.49416160583496094),
 ('villetard', 0.4853004515171051),
 ('incusso', 0.4846210479736328),
 ('cagionare', 0.4836124777793884),
 ('riurto', 0.4775114953517914),
 ('scompiglio', 0.47573307156562805)]

In [32]:
w2v4.save(os.path.join('../trained_models/Word2Vec40', '40w2v4.model'))

### Zeitraum 5: 1851-1875

In [33]:
text5 = ''

for i in df5.lemmatized_text:
    text5 += i

In [34]:
%%time
sentences5 = tokenize_text(text5)

Wall time: 30.8 s


In [35]:
%%time
w2v5 = Word2Vec(sentences=sentences5,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 4min 56s


In [36]:
w2v5.wv.most_similar(positive=['terrore'], topn=10)

[('sgomento', 0.523679256439209),
 ('agonia', 0.5085601806640625),
 ('incutere', 0.5031141042709351),
 ('ferocia', 0.4934081733226776),
 ('immaginava', 0.49203574657440186),
 ('smanioso', 0.48749709129333496),
 ('ilil', 0.4874187409877777),
 ('rammarichío', 0.48602941632270813),
 ('balzelloni', 0.4840412735939026),
 ('orrore', 0.4827539324760437)]

In [37]:
w2v5.save(os.path.join('../trained_models/Word2Vec40', '40w2v5.model'))

### Zeitraum 6: 1876-1900

In [38]:
text6 = ''

for i in df6.lemmatized_text:
    text6 += i

In [39]:
%%time
sentences6 = tokenize_text(text6)

Wall time: 30.9 s


In [40]:
%%time
w2v6 = Word2Vec(sentences=sentences6,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 4min 16s


In [41]:
w2v6.wv.most_similar(positive=['terrore'], topn=10)

[('spavento', 0.4669657349586487),
 ('superstizioso', 0.46506619453430176),
 ('calamità', 0.4564853012561798),
 ('forsennato', 0.44041991233825684),
 ('imputato', 0.43729904294013977),
 ('carneficina', 0.43668514490127563),
 ('territio', 0.4300159513950348),
 ('atterrì', 0.42792680859565735),
 ('spaventoso', 0.42791056632995605),
 ('consideravasi', 0.4210168421268463)]

In [42]:
w2v6.save(os.path.join('../trained_models/Word2Vec40', '40w2v6.model'))

### Zeitraum 7: 1901-1925

In [43]:
text7 = ''

for i in df7.lemmatized_text:
    text7 += i

In [44]:
%%time
sentences7 = tokenize_text(text7)

Wall time: 28.9 s


In [45]:
%%time
w2v7 = Word2Vec(sentences=sentences7,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 4min 20s


In [46]:
w2v7.wv.most_similar(positive=['terrore'], topn=10)

[('sudar', 0.46295544505119324),
 ('gelare', 0.45987361669540405),
 ('sfuggito', 0.454312801361084),
 ('orrore', 0.4411762058734894),
 ('accesso', 0.43962740898132324),
 ('efferato', 0.43903422355651855),
 ('inopinatamente', 0.4357953667640686),
 ('stremare', 0.4342036843299866),
 ('approssimarsi', 0.4321115016937256),
 ('giovanezza', 0.4310533404350281)]

In [47]:
w2v7.save(os.path.join('../trained_models/Word2Vec40', '40w2v7.model'))

### Zeitraum 8: 1926-1950

In [48]:
text8 = ''

for i in df8.lemmatized_text:
    text8 += i

In [49]:
%%time
sentences8 = tokenize_text(text8)

Wall time: 26.2 s


In [50]:
%%time
w2v8 = Word2Vec(sentences=sentences8,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 4min 37s


In [51]:
w2v8.wv.most_similar(positive=['terrore'], topn=10)

[('rabbico', 0.5343318581581116),
 ('virus', 0.523741602897644),
 ('innestato', 0.498747855424881),
 ('agghiacciò', 0.4925477206707001),
 ('impietrire', 0.4903981387615204),
 ('canino', 0.4815654158592224),
 ('pasticcetti', 0.4803818464279175),
 ('frammettendosi', 0.4795314371585846),
 ('fours', 0.4747002422809601),
 ('paralizzare', 0.47211429476737976)]

In [52]:
w2v8.save(os.path.join('../trained_models/Word2Vec40', '40w2v8.model'))

### Zeitraum 9: 1951-1975

In [53]:
text9 = ''

for i in df9.lemmatized_text:
    text9 += i

In [54]:
%%time
sentences9 = tokenize_text(text9)

Wall time: 23.8 s


In [55]:
%%time
w2v9 = Word2Vec(sentences=sentences9,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 4min 35s


In [56]:
w2v9.wv.most_similar(positive=['terrore'], topn=10)

[('malefico', 0.5923131108283997),
 ('insano', 0.5851815938949585),
 ('guarentisce', 0.5683953762054443),
 ('assopita', 0.5673373937606812),
 ('involare', 0.5580342411994934),
 ('erastus', 0.5559908747673035),
 ('favilla', 0.5555555820465088),
 ('piccolissimo', 0.5522016882896423),
 ('orrore', 0.5487774610519409),
 ('spasmodicamente', 0.5479488968849182)]

In [57]:
w2v9.save(os.path.join('../trained_models/Word2Vec40', '40w2v9.model'))

### Zeitraum 10: 1976-2000

In [58]:
text10 = ''

for i in df10.lemmatized_text:
    text10+= i

In [59]:
%%time
sentences10 = tokenize_text(text10)

Wall time: 26.4 s


In [60]:
%%time
w2v10 = Word2Vec(sentences=sentences10,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 4min 19s


In [61]:
w2v10.wv.most_similar(positive=['terrore'], topn=10)

[('staliniano', 0.5585861802101135),
 ('pol', 0.555791437625885),
 ('bombarolo', 0.5481665730476379),
 ('autobomba', 0.5478269457817078),
 ('deportazione', 0.5416848063468933),
 ('mietere', 0.5411224365234375),
 ('terrorismo', 0.5408968329429626),
 ('suicida', 0.5405609607696533),
 ('maoista', 0.5402685403823853),
 ('fratricida', 0.5391478538513184)]

In [62]:
w2v10.save(os.path.join('../trained_models/Word2Vec40', '40w2v10.model'))

### Zeitraum 11: 2001-2010

In [63]:
text11 = ''

for i in df11.lemmatized_text:
    text11+= i

In [64]:
%%time
sentences11 = tokenize_text(text11)

Wall time: 23 s


In [65]:
%%time
w2v11 = Word2Vec(sentences=sentences11,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 5min 51s


In [66]:
w2v11.wv.most_similar(positive=['terrore'], topn=10)

[('barbarie', 0.5392811894416809),
 ('oppressione', 0.5373106002807617),
 ('fredda', 0.5351439118385315),
 ('feroce', 0.534885585308075),
 ('insinuare', 0.5345460176467896),
 ('tiranno', 0.5345378518104553),
 ('fondamentalista', 0.5338853597640991),
 ('fanatismo', 0.5332879424095154),
 ('irrazionale', 0.5317484736442566),
 ('irriducibile', 0.52961266040802)]

In [67]:
w2v11.save(os.path.join('../trained_models/Word2Vec40', '40w2v11.model'))

### Zeitraum 12: 2011-2016

In [68]:
text12 = ''

for i in df12.lemmatized_text:
    text12+= i

In [69]:
%%time
sentences12 = tokenize_text(text12)

Wall time: 19.1 s


In [70]:
%%time
w2v12 = Word2Vec(sentences=sentences12,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 6min 12s


In [71]:
w2v12.wv.most_similar(positive=['terrore'], topn=10)

[('ripiombare', 0.6037869453430176),
 ('seminare', 0.5989764332771301),
 ('lessico', 0.5811809301376343),
 ('incutere', 0.5759825706481934),
 ('cherry', 0.5750294923782349),
 ('smileys', 0.5644280910491943),
 ('plum', 0.5637722611427307),
 ('trapasso', 0.563490092754364),
 ('desolazione', 0.5593907833099365),
 ('cieco', 0.557325005531311)]

In [72]:
w2v12.save(os.path.join('../trained_models/Word2Vec40', '40w2v12.model'))

### Zeitraum 13: 2017-2021

In [73]:
text13 = ''

for i in df13.lemmatized_text:
    text13+= i

In [74]:
%%time
sentences13 = tokenize_text(text13)

Wall time: 18.9 s


In [75]:
%%time
w2v13 = Word2Vec(sentences=sentences13,                   
                vector_size=vector_size,          
                window=window,                
                min_count=min_count,              
                workers=workers, 
                min_alpha=min_alpha,         
                sg=sg,                     
                seed=seed)

Wall time: 6min 18s


In [76]:
w2v13.wv.most_similar(positive=['terrore'], topn=10)

[('incutere', 0.6380201578140259),
 ('orrore', 0.6357980966567993),
 ('disorientamento', 0.6223702430725098),
 ('sadismo', 0.6092866659164429),
 ('impotenza', 0.6058338284492493),
 ('lovecraft', 0.6026023030281067),
 ('oppressione', 0.6009206175804138),
 ('giacobino', 0.5950257182121277),
 ('desolazione', 0.59151291847229),
 ('jihad', 0.5880140662193298)]

In [77]:
w2v13.save(os.path.join('../trained_models/Word2Vec40', '40w2v13.model'))