In [1]:
import glob
import regex as re
from collections import Counter

import pandas as pd

from scipy.stats import zscore
from scipy.spatial import distance

import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

class Zscores():
    def __init__(self, data):
        self.data = data

    def remove_stopwords(self):
        stopword = open("de_stopwords.txt")
        stopwords = stopword.read()
        self.data['text'] = [str(i).lower() for i in self.data['text']]
        self.data['removedstopword'] = self.data['text'].apply(lambda x: ' '.join([item for item in str(x).split() if item not in stopwords]))
        return self.data


    def count_frequencies(self, df):
        freq_list = []
        for i, row in df.iterrows():
            title = str(row.ID)
            vocab = Counter(row.removedstopword.split())
            frequencies = list(vocab.values())
            words = list(vocab.keys())
            freq_list.append(pd.Series(frequencies, words, name=title))
        return freq_list

    def calculate_zscores(self):
        df = self.remove_stopwords()
        freq_list = self.count_frequencies(df)
        counts = pd.DataFrame(freq_list)
        counts = counts.fillna(0)
        counts = counts.div(counts.sum(axis=1), axis=0)
        counts.loc['Total_per_word'] = counts.sum()
        counts = counts.sort_values(by='Total_per_word', axis=1, ascending=False)
        counts.drop('Total_per_word', inplace=True, axis=0)
#         print(counts)
        zscores = counts.apply(zscore)
#         print(zscores)

        zscores.drop(zscores.columns[1000:], inplace=True, axis=1)

        return counts, zscores


In [5]:
poems = pd.read_csv('../corpus/0_csv_fuer_delta/fontane_corpus_gesamt.csv')
z = Zscores(poems)
counts, zscores = z.calculate_zscores()
# zscores.to_csv('../results/delta/zscores_fontane.csv')
counts



                                                      in+die      herz  \
autoren_lyrik_lemma_Fontane_Die zwei Raben          0.000000  0.000000   
autoren_lyrik_lemma_Fontane_2. Seydlitz und der...  0.000000  0.000000   
autoren_lyrik_lemma_fontane_4. Es kann die Ehre...  0.000000  0.000000   
autoren_lyrik_lemma_fontane_5. [O, wie weit, wi...  0.025641  0.128205   
autoren_lyrik_lemma_fontane_König Karl der Zwei...  0.000000  0.000000   
...                                                      ...       ...   
autoren_lyrik_lemma_fontane_Religion                0.023923  0.009569   
autoren_lyrik_lemma_Fontane_Prolog                  0.000000  0.000000   
autoren_lyrik_lemma_Fontane_4. Re Umbertos Kranz    0.038462  0.000000   
autoren_lyrik_lemma_fontane_Hastingsfeld            0.031746  0.000000   
autoren_lyrik_lemma_fontane_Kaiser Wilhelms Rüc...  0.000000  0.000000   

                                                      zu+die    an+die  \
autoren_lyrik_lemma_Fontane_Die zwei 

                                                    in+die  herz  zu+die  \
autoren_lyrik_lemma_Fontane_Die zwei Raben             NaN   NaN     NaN   
autoren_lyrik_lemma_Fontane_2. Seydlitz und der...     NaN   NaN     NaN   
autoren_lyrik_lemma_fontane_4. Es kann die Ehre...     NaN   NaN     NaN   
autoren_lyrik_lemma_fontane_5. [O, wie weit, wi...     NaN   NaN     NaN   
autoren_lyrik_lemma_fontane_König Karl der Zwei...     NaN   NaN     NaN   
...                                                    ...   ...     ...   
autoren_lyrik_lemma_fontane_Religion                   NaN   NaN     NaN   
autoren_lyrik_lemma_Fontane_Prolog                     NaN   NaN     NaN   
autoren_lyrik_lemma_Fontane_4. Re Umbertos Kranz       NaN   NaN     NaN   
autoren_lyrik_lemma_fontane_Hastingsfeld               NaN   NaN     NaN   
autoren_lyrik_lemma_fontane_Kaiser Wilhelms Rüc...     NaN   NaN     NaN   

                                                    an+die  alt  stehen  \
autoren_lyri

Unnamed: 0,in+die,herz,zu+die,an+die,alt,stehen,liegen,still,glück,kind,...,knöchel,treulich,ausgehöhlt,baumstamm,felsig,phöniciens,great,eastern,eisengeschuppten,mephisto
autoren_lyrik_lemma_Fontane_Die zwei Raben,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
autoren_lyrik_lemma_Fontane_2. Seydlitz und der Bürgermeister von Ohlau,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
autoren_lyrik_lemma_fontane_4. Es kann die Ehre dieser Welt,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"autoren_lyrik_lemma_fontane_5. [O, wie weit, wie weit]",0.025641,0.128205,0.000000,0.000000,0.000000,0.000000,0.025641,0.0,0.025641,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
autoren_lyrik_lemma_fontane_König Karl der Zweite von Engelland,0.000000,0.000000,0.000000,0.000000,0.000000,0.029412,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
autoren_lyrik_lemma_fontane_Religion,0.023923,0.009569,0.009569,0.000000,0.004785,0.009569,0.004785,0.0,0.000000,0.004785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
autoren_lyrik_lemma_Fontane_Prolog,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
autoren_lyrik_lemma_Fontane_4. Re Umbertos Kranz,0.038462,0.000000,0.038462,0.000000,0.038462,0.000000,0.000000,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
autoren_lyrik_lemma_fontane_Hastingsfeld,0.031746,0.000000,0.000000,0.015873,0.000000,0.000000,0.015873,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
z = counts - counts.mean()/counts.std(ddof=0)
z

Unnamed: 0,in+die,herz,zu+die,an+die,alt,stehen,liegen,still,glück,kind,...,knöchel,treulich,ausgehöhlt,baumstamm,felsig,phöniciens,great,eastern,eisengeschuppten,mephisto
autoren_lyrik_lemma_Fontane_Die zwei Raben,-0.573394,-0.375032,-0.357853,-0.297434,-0.334467,-0.207769,-0.261299,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
autoren_lyrik_lemma_Fontane_2. Seydlitz und der Bürgermeister von Ohlau,-0.573394,-0.375032,-0.357853,-0.297434,-0.334467,-0.207769,-0.261299,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
autoren_lyrik_lemma_fontane_4. Es kann die Ehre dieser Welt,-0.573394,-0.375032,-0.357853,-0.297434,-0.334467,-0.207769,-0.261299,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
"autoren_lyrik_lemma_fontane_5. [O, wie weit, wie weit]",-0.547753,-0.246826,-0.357853,-0.297434,-0.334467,-0.207769,-0.235658,-0.24733,-0.167981,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
autoren_lyrik_lemma_fontane_König Karl der Zweite von Engelland,-0.573394,-0.375032,-0.357853,-0.297434,-0.334467,-0.178357,-0.261299,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
autoren_lyrik_lemma_fontane_Religion,-0.549471,-0.365462,-0.348284,-0.297434,-0.329682,-0.198200,-0.256514,-0.24733,-0.193622,-0.197495,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
autoren_lyrik_lemma_Fontane_Prolog,-0.573394,-0.375032,-0.357853,-0.297434,-0.334467,-0.207769,-0.261299,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
autoren_lyrik_lemma_Fontane_4. Re Umbertos Kranz,-0.534933,-0.375032,-0.319391,-0.297434,-0.296005,-0.207769,-0.261299,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732
autoren_lyrik_lemma_fontane_Hastingsfeld,-0.541648,-0.375032,-0.357853,-0.281561,-0.334467,-0.207769,-0.245426,-0.24733,-0.193622,-0.202280,...,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732,-0.03732


In [2]:
poems = pd.read_csv('../corpus/0_csv_fuer_delta/hauptcorpus_gesamt.csv')
z = Zscores(poems)
counts, zscores = z.calculate_zscores()
# zscores.to_csv('../results/delta/zscores_fontane.csv')

In [3]:
counts

Unnamed: 0,in+die,herz,zu+die,an+die,nacht,still,hand,von+die,tief,auge,...,seďfs,schiffbruches,schrankartiger,deghri,patronenkapsel,ketschikise,kettschuwal,ziegenfell,haarseite,herübergeklungen
lyrik_lemma_l00100002,0.058824,0.000000,0.019608,0.000000,0.000000,0.000000,0.000000,0.019608,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lyrik_lemma_l00100003,0.000000,0.045455,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lyrik_lemma_l00100004,0.058824,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.019608,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lyrik_lemma_l00100005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.022727,0.000000,0.022727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lyrik_lemma_l00100006,0.047619,0.023810,0.000000,0.000000,0.000000,0.023810,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
epik_lemma_lewald_clementine,0.006754,0.009583,0.004928,0.002921,0.001643,0.001186,0.003012,0.000730,0.002464,0.003833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
epik_lemma_falke_durchschnitt,0.011482,0.001914,0.008190,0.005971,0.000153,0.000306,0.002603,0.002067,0.001684,0.002679,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
epik_lemma_heiberg_frau,0.008912,0.001936,0.004681,0.002521,0.000900,0.000540,0.005041,0.000990,0.001575,0.004276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
epik_lemma_meyer_richterin,0.009249,0.002563,0.004012,0.004792,0.000891,0.001003,0.007689,0.001449,0.001560,0.005683,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
z = counts - counts.mean()/counts.std(ddof=0)
z

MemoryError: Unable to allocate 2.34 GiB for an array with shape (4750, 66084) and data type float64