In [1]:
import codecs
from collections import Counter
import nltk
import numpy as np
import os
import pandas as pd
import re
import scipy
import spacy

In [2]:
# Load the final corpus table from its CSV export.
corpus_csv = '../Korpus/Korpus/corpus_final.csv'
df = pd.read_csv(corpus_csv, sep=',', encoding='utf-8')

In [6]:
# Give the two space-containing columns identifier-style names so that they
# can be accessed as attributes (e.g. df.lemmatized_text).
column_renames = {
    'lemmatized text': 'lemmatized_text',
    'cleaned tokenized text': 'cleaned_tokenized_text',
}
df.rename(columns=column_renames, inplace=True)

In [7]:
# Preview the first rows to check the loaded and renamed columns.
df.head()

Unnamed: 0,doc,source,author,title,year,period,text type,text,words,lemmatized_text,cleaned_tokenized_text
0,Espositivi.IV.4.Testo.txt,MIDIA,Ludovico Antonio Muratori,Antichità italiane,1700.0,1700-1750,espositivo,"﻿IV. 4. Ludovico Antonio Muratori, Antichità i...",8990.0,"﻿iv . 4 . Ludovico Antonio muratori , antichit...","[['iv'], [], ['ludovico', 'antonio', 'muratori..."
1,Poesia.IV.1.Testo.txt,MIDIA,Giuseppe Paolucci (Alessi Cillenio),Poesie,1700.0,1700-1750,poesia,IV. 1. Rime degli Arcadi: Alessi Cillenio (Giu...,10862.0,iv . 1 . rima del arcadi : alessi cillenio ( G...,"[['iv'], [], ['rima', 'arcadi', 'alessi', 'cil..."
2,Personali.IV.4.Testo.txt,MIDIA,Vincenzo da Filicaia,Lettere inedite a Lorenzo Magalotti,1700.0,1700-1750,personale,"IV. 4. Vincenzo da Filicaia, Lettere inedite a...",10073.0,"iv . 4 . Vincenzo da filicaia , lettere inedit...","[['iv'], [], ['vincenzo', 'filicaia', 'lettere..."
3,Personali.IV.5.Testo.txt,MIDIA,Lorenzo Magalotti,Lettere odorose (1693-1705),1700.0,1700-1750,personale,"IV. 5. Lorenzo Magalotti, Lettere odorose (169...",8374.0,"iv . 5 . Lorenzo magalotti , lettere odoroso (...","[['iv'], [], ['lorenzo', 'magalotti', 'lettere..."
4,Poesia.IV.4.Testo.txt,MIDIA,Faustina Maratti Zappi,Poesie,1700.0,1700-1750,poesia,IV. 4. Rime degli Arcadi: Aglauro Cidonia (Fau...,3184.0,iv . 4 . rima del arcadi : aglauro cidonia ( f...,"[['iv'], [], ['rima', 'arcadi', 'aglauro', 'ci..."


In [8]:
# (number of rows, number of columns) of the corpus frame.
df.shape

(697296, 11)

In [9]:
# Replace missing texts with empty strings; a NaN entry would otherwise break
# string operations such as ' '.join(...) over the lemmatized text.
for text_column in ('text', 'lemmatized_text'):
    df[text_column] = df[text_column].fillna('')

In [10]:
# Build one sub-frame per period.

# Map every period label to the rows that belong to it. A comprehension is
# used on purpose: its loop variables do not leak, so the global `df` keeps
# pointing at the full corpus frame and this cell can safely be re-run.
df_periods = {period: frame for period, frame in df.groupby(by='period')}

# Period labels in order of first appearance. Rows without a period end up in
# no group (groupby drops NaN keys), so NaN is removed here as well to keep
# the lookup below from raising a KeyError.
periods = list(df['period'].dropna().unique())

# Sub-frames aligned index-by-index with `periods`.
period_frames = [df_periods[period] for period in periods]

In [15]:
# Function that counts selected keywords (in the lemmatized text)

def count_keywords(df, keywords):
    """Count how often each keyword occurs as a token in ``df.lemmatized_text``.

    All strings in ``df.lemmatized_text`` are joined with single spaces,
    lowercased and split on whitespace. Keywords are lowercased too, so the
    matching is case-insensitive. A keyword made of several words is first
    fused into one underscore-joined token wherever its words occur as
    consecutive whole tokens; this lets it be counted like a single word.
    Multi-word keywords are fused in the order given, so when two of them
    overlap in the text the earlier one wins.

    Parameters
    ----------
    df : object with a ``lemmatized_text`` attribute
        The attribute must be an iterable of strings (no missing values),
        e.g. a DataFrame column.
    keywords : iterable of str
        Words or space-separated multi-word expressions to count.

    Returns
    -------
    dict
        Maps each keyword, exactly as passed in, to its number of occurrences.
    """
    corpus = ' '.join(df.lemmatized_text).lower()

    # keyword as given by the caller -> token that is counted in the corpus
    keyword_tokens = {}
    for w in keywords:
        parts = w.lower().split()
        token = '_'.join(parts)
        keyword_tokens[w] = token

        if len(parts) > 1:
            # Match the words only as whole, consecutive tokens. A plain
            # substring replace would also hit e.g. "la paura" inside
            # "della paura" and thereby destroy the neighbouring tokens.
            pattern = (r'(?<!\S)'
                       + r'\s+'.join(re.escape(part) for part in parts)
                       + r'(?!\S)')
            # A callable replacement keeps backslashes in `token` literal.
            corpus = re.sub(pattern, lambda _match, token=token: token, corpus)

    all_counts = Counter(corpus.split())
    return {w: all_counts[token] for w, token in keyword_tokens.items()}

In [16]:
# Possible candidates

keywords = [
    # adverbs in -mente, plus 'molto'
    'atrocemente', 'formidabilmente', 'mostruosamente', 'orrendamente',
    'orribilmente', 'spaventosamente', 'terribilmente', 'tremendamente',
    'molto',
    # nouns and verbs (presumably the fear-related vocabulary under study)
    'angoscia', 'angosciare', 'ansia', 'atterrire', 'panico', 'paura',
    'raccapricciare', 'raccapriccio', 'schifo', 'sgomentare', 'sgomento',
    'spaventare', 'spavento', 'terrore',
    # adverbs of time and frequency
    'domani', 'finora', 'ieri', 'oggi', 'presto', 'raramente', 'sempre',
    'spesso',
]

In [17]:
# Keyword counts per period: {period label: {keyword: count}}.
# A comprehension is used instead of a `for df, ...` loop so that the global
# `df` (the full corpus frame) is not rebound to the last period's sub-frame.
candidates = {
    period: count_keywords(period_frame, keywords)
    for period_frame, period in zip(period_frames, periods)
}

In [18]:
# Show the nested {period: {keyword: count}} result.
candidates

{'1700-1750': {'atrocemente': 2,
  'formidabilmente': 0,
  'mostruosamente': 2,
  'orrendamente': 1,
  'orribilmente': 20,
  'spaventosamente': 9,
  'terribilmente': 13,
  'tremendamente': 0,
  'molto': 11215,
  'angoscia': 25,
  'angosciare': 0,
  'ansia': 1,
  'atterrire': 202,
  'panico': 22,
  'paura': 583,
  'raccapricciare': 7,
  'raccapriccio': 1,
  'schifo': 11,
  'sgomentare': 61,
  'sgomento': 7,
  'spaventare': 136,
  'spavento': 391,
  'terrore': 597,
  'domani': 56,
  'finora': 444,
  'ieri': 25,
  'oggi': 913,
  'presto': 509,
  'raramente': 9,
  'sempre': 2269,
  'spesso': 814},
 '1751-1800': {'atrocemente': 3,
  'formidabilmente': 0,
  'mostruosamente': 0,
  'orrendamente': 1,
  'orribilmente': 25,
  'spaventosamente': 1,
  'terribilmente': 13,
  'tremendamente': 0,
  'molto': 16013,
  'angoscia': 124,
  'angosciare': 0,
  'ansia': 2,
  'atterrire': 134,
  'panico': 14,
  'paura': 285,
  'raccapricciare': 2,
  'raccapriccio': 8,
  'schifo': 31,
  'sgomentare': 18,
  'sg

In [19]:
# Turn the nested counts into a table: outer keys (periods) become the row
# index, inner keys (keywords) become the columns.
table = pd.DataFrame.from_dict(
    candidates,
    orient='index',
)
table

Unnamed: 0,atrocemente,formidabilmente,mostruosamente,orrendamente,orribilmente,spaventosamente,terribilmente,tremendamente,molto,angoscia,...,spavento,terrore,domani,finora,ieri,oggi,presto,raramente,sempre,spesso
1700-1750,2,0,2,1,20,9,13,0,11215,25,...,391,597,56,444,25,913,509,9,2269,814
1751-1800,3,0,0,1,25,1,13,0,16013,124,...,212,595,189,271,108,714,975,80,3334,1347
1801-1825,2,0,2,10,28,2,16,2,11595,142,...,217,340,363,125,412,1418,670,30,3287,866
1826-1850,25,2,3,11,61,4,33,6,12414,306,...,671,1375,239,339,169,1086,906,47,3464,1487
1876-1900,119,16,36,45,223,81,209,67,24739,1202,...,1199,1931,5289,892,2764,5021,4100,309,9629,3427
1901-1925,59,14,21,33,198,104,169,19,17134,1048,...,657,1343,4313,604,1746,3645,2846,337,6433,2958
1926-1950,37,9,15,27,90,68,176,24,13130,814,...,408,1122,2438,285,1176,2818,1244,110,3943,1113
1951-1975,16,0,6,6,21,26,14,3,9315,116,...,55,151,233,86,309,1170,381,49,2459,783
1976-2000,97,5,78,171,156,140,1045,348,36297,5930,...,1001,7062,8103,7345,11037,12097,7678,3040,13329,8530
2001-2010,147,35,214,215,339,396,1938,1070,26935,2891,...,1661,2944,3719,2831,4511,7014,3242,2609,8348,4090


In [20]:
# Write the per-period keyword counts to a CSV file.
candidates_csv = '../Korpus/Korpus/candidates_after_lemmatization.csv'
table.to_csv(candidates_csv)