In [10]:
import nltk
import numpy as np
import zipfile
import spacy
import string
# Load the German spaCy pipeline with the 'parser' and 'ner' components
# disabled; only tokenisation / lemma information is used by preprocess().
nlp = spacy.load('de_core_news_md',disable = ['parser','ner'])
# Raise the pipeline's input-length limit.
# NOTE(review): presumably needed because a whole corpus file is passed to
# nlp() in one call and exceeds spaCy's default limit - confirm.
nlp.max_length = 3100000

# German stopword list shipped with the NLTK stopwords corpus.
stopwords = nltk.corpus.stopwords.words('german')
# ASCII punctuation plus a few extra quote / ellipsis sequences. This is a
# single string, so `word in punctuation` is a substring test.
punctuation = string.punctuation+"’``''‘..."



In [11]:
# Read every file in the archive into memory as UTF-8 text, keyed by its path
# inside the archive.
content = {}
with zipfile.ZipFile("keywords.zip") as zfile:
    for member in zfile.namelist():
        # Directory entries are listed with a trailing "/". Skip all of them
        # rather than one hard-coded directory name, so no empty pseudo-file
        # ends up in `content` if the archive layout changes.
        if member.endswith("/"):
            continue
        content[member] = zfile.read(member).decode("utf8")
content.keys()

dict_keys(['keyword/brd_sentences.txt', 'keyword/ddr_sentences.txt', 'keyword/news.txt'])

In [12]:
def preprocess(txt):
    """Turn raw text into a list of normalised content tokens.

    The text is run through the module-level spaCy pipeline ``nlp``. Each
    non-punctuation token is replaced by its lemma, punctuation tokens keep
    their surface form, and everything is lower-cased. A token is dropped if
    it occurs as a substring of ``punctuation``, has fewer than three
    characters, is contained in ``stopwords``, or consists only of digits.

    Args:
        txt: Raw text to process.

    Returns:
        list of str: The surviving tokens in document order.
    """
    kept = []
    for token in nlp(txt):
        if token.is_punct:
            word = str(token)
        else:
            word = token.lemma_
        word = word.lower()

        # `punctuation` is a string, so this is a substring test.
        if word in punctuation:
            continue
        if len(word) <= 2:
            continue
        if word in stopwords:
            continue
        if word.isdigit():
            continue
        kept.append(word)

    return kept

In [13]:
class TFIDF:
    """TF-IDF keyword extraction over a small set of named corpora.

    Call ``create_dtm`` once with a mapping of corpus name to raw text, then
    ``tfidf_keywords`` to get the highest-scoring terms for each corpus.

    Attributes set by ``create_dtm``:
        texts: dict mapping corpus name -> list of preprocessed tokens.
        vocab: list of retained terms, ordered by descending overall count.
        term_frequencies: dict mapping corpus name -> nltk.FreqDist of its
            tokens.
        dtm: numpy array of raw counts with one row per corpus (in the order
            of ``texts``) and one column per entry of ``vocab``.
    """

    def create_dtm(self, texts, cut_first=1, min_freq=3):
        """Preprocess the corpora and build the document-term matrix.

        Args:
            texts: Mapping of corpus name to raw text.
            cut_first: Number of terms to drop from the top of the overall
                frequency ranking before any other pruning.
            min_freq: Minimum overall count (summed over all corpora) a term
                needs to stay in the vocabulary.
        """
        # Clean texts
        self.texts = {name: preprocess(raw) for name, raw in texts.items()}

        # Count tokens over all corpora. A generator is used instead of
        # sum(lists, []) because the latter re-copies the growing list for
        # every corpus.
        overall = nltk.FreqDist(
            token for tokens in self.texts.values() for token in tokens
        )

        # Prune the overall vocabulary: rank by descending count, drop the
        # `cut_first` top-ranked terms, then drop terms rarer than `min_freq`.
        ranked = sorted(overall.items(), key=lambda item: -item[1])
        self.vocab = [term for term, count in ranked[cut_first:] if count >= min_freq]

        # Count per corpus and restrict the counts to the pruned vocabulary.
        self.term_frequencies = {
            name: nltk.FreqDist(tokens) for name, tokens in self.texts.items()
        }
        self.dtm = np.array(
            [
                [freq.get(term, 0) for term in self.vocab]
                for freq in self.term_frequencies.values()
            ]
        )

    def tfidf(self):
        """Return the TF-IDF score matrix, with the same shape as ``self.dtm``.

        tf is the natural log of the raw count; the 1e-25 offset only avoids
        log(0). A term occurring once therefore has tf 0, and a term absent
        from a corpus gets a large negative tf. idf is
        log(number of corpora / number of corpora containing the term).
        """
        tf = np.log(self.dtm + 1e-25)
        #tf = 0.5 + 0.5 * self.dtm / self.dtm.max(-1, keepdims=True) # alternative normalization.
        # Document frequency must come from this instance's own matrix, not
        # from a module-level variable.
        doc_freq = (self.dtm > 0).sum(0)
        idf = np.log(self.dtm.shape[0] / (doc_freq + 1e-25))
        return tf * idf

    def tfidf_keywords(self, n=10):
        """Return the ``n`` highest-scoring terms for every corpus.

        Args:
            n: Maximum number of terms to return per corpus.

        Returns:
            dict mapping corpus name to a list of terms, best first.
        """
        scores = self.tfidf()
        return {
            name: [self.vocab[col] for col in scores[row].argsort()[::-1][:n]]
            for row, name in enumerate(self.texts.keys())
        }
    


In [14]:
# Build the document-term matrix for the loaded corpora with the default
# pruning settings of create_dtm (cut_first=1, min_freq=3).
da = TFIDF()
da.create_dtm(content)

In [15]:
# Show the 25 highest-scoring TF-IDF terms for each corpus.
da.tfidf_keywords(n=25)

{'keyword/brd_sentences.txt': ['bum',
  'tuut',
  'dub',
  'amadeus',
  'hadschi',
  'annabelle',
  'goodbye',
  'tarzan',
  'fütter',
  'adio',
  'knutschfleck',
  'morgana',
  'fata',
  'traumboy',
  'nana-nana',
  'mamy',
  'gianna',
  'dudub',
  'verdamp',
  'golle',
  'willem',
  'sha',
  'holla',
  'holadie',
  'pan'],
 'keyword/ddr_sentences.txt': ["seh'n",
  "geh'n",
  "steh'n",
  "uns're",
  "ander'n",
  'kurschatt',
  'elfenbein',
  "versteh'n",
  'gradaus',
  'traumzeit',
  'superfrau',
  'drache',
  "and're",
  'immerfort',
  'nebelmeer',
  "wenn's",
  "geseh'n",
  "zieh'n",
  'weltenmeer',
  "bist'n",
  "über's",
  'lebensroulette',
  'susan',
  'gojko',
  'arbeitsschluss'],
 'keyword/news.txt': ['euro',
  'unternehmen',
  'trump',
  'team',
  'politisch',
  'mehrere',
  'regierung',
  'pro',
  'europäisch',
  'aktie',
  'zunächst',
  'ehemalig',
  'angabe',
  'saison',
  'betonen',
  'bislang',
  'entsprechend',
  'zufolge',
  'schweiz',
  'mitarbeiter',
  'unterstützen',