In [1]:
import nltk
import string
import pickle
import numpy as np
from collections import Counter

In [2]:
import zipfile

# Read every member of keyword.zip into memory as UTF-8 text, keyed by its
# path inside the archive. The bare "keyword/" directory entry is skipped.
with zipfile.ZipFile("keyword.zip") as archive:
    content = {
        member: archive.read(member).decode("utf8")
        for member in archive.namelist()
        if member != "keyword/"
    }
content.keys()



dict_keys(['keyword/automobil_50k.txt', 'keyword/news_text100k.txt', 'keyword/sport_50k.txt', 'keyword/wirtschaft_50k.txt'])

In [3]:
def clean(text, language="english"):
    """Split raw text into sentences of lowercase, purely alphabetic tokens.

    The text is lowercased, split into sentences with ``nltk.sent_tokenize``
    and each sentence into tokens with ``nltk.word_tokenize``. Only tokens for
    which ``str.isalpha()`` is true are kept. Sentences left with fewer than
    two tokens are dropped (a single token cannot form a bigram).

    Args:
        text: Raw document text.
        language: Name of the NLTK Punkt model handed to both tokenizers.
            Defaults to ``"english"`` (NLTK's own default), so existing calls
            behave exactly as before; pass ``"german"`` for German corpora.

    Returns:
        A list of sentences, each a list of token strings.
    """
    tokenized_sents = []
    for sent in nltk.sent_tokenize(text.lower(), language=language):
        # isalpha() is false for punctuation, quote marks such as `` and '',
        # currency symbols and the empty string, so no separate punctuation
        # or symbol filter is needed in front of it.
        tokens = [
            tok
            for tok in nltk.word_tokenize(sent, language=language)
            if tok.isalpha()
        ]
        if len(tokens) > 1:
            tokenized_sents.append(tokens)
    return tokenized_sents

In [4]:


# Tokenize every corpus in place. Only raw strings are cleaned, so running
# this cell a second time is a no-op instead of failing with AttributeError
# when clean() is handed an already-tokenized list.
for key, text in content.items():
    if isinstance(text, str):
        content[key] = clean(text)

In [5]:
def create_bigrams(dic_corpora, stopwords=None):
    """Collect the bigrams of every corpus, skipping those with a stop word.

    Bigrams are formed inside each sentence only, never across a sentence
    boundary. A bigram is dropped when either of its tokens is a stop word.
    The key of each corpus is printed as it is processed.

    Args:
        dic_corpora: Mapping from corpus name to an iterable of sentences,
            each sentence an iterable of token strings.
        stopwords: Optional iterable of tokens that disqualify a bigram.
            When omitted, the built-in list of corpus artefacts is used.
            A language stop-word list can be passed here instead.

    Returns:
        A dict mapping each corpus name to a list of ``(token, token)`` tuples
        in order of occurrence.
    """
    if stopwords is None:
        # "\\|lbr\\|" is the same string the old "\|lbr\|" literal produced,
        # written without the invalid escape sequence.
        stopwords = ["findbecci", "--", "..", "§", "”", "rebecca",
                     "freeflorian", "\\|lbr\\|", "|lbr|"]
    # Build the lookup set once instead of once per bigram.
    blocked = frozenset(stopwords)

    ngram_dict = {}
    for key, sentences in dic_corpora.items():
        print(key)
        grams = []
        for sent in sentences:
            tokens = list(sent)
            # Pairing each token with its successor yields exactly the
            # consecutive 2-tuples of the sentence.
            grams.extend(
                pair for pair in zip(tokens, tokens[1:])
                if blocked.isdisjoint(pair)
            )
        ngram_dict[key] = grams
    return ngram_dict

In [6]:
# Build the bigram lists for every cleaned corpus; create_bigrams prints each
# corpus key as it is processed.
bigrams = create_bigrams(content)

keyword/automobil_50k.txt
keyword/news_text100k.txt
keyword/sport_50k.txt
keyword/wirtschaft_50k.txt


In [7]:

def load_obj(name):
    """Unpickle and return the object stored in ``obj/<name>.pkl``.

    Args:
        name: File name without directory or ``.pkl`` extension.

    Returns:
        Whatever object the pickle file contains.

    Note:
        Unpickling can execute arbitrary code, so only load files from a
        trusted source.
    """
    path = 'obj/' + name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [8]:
# Load the pickled n-gram object from obj/ngrams_hatespeech_with_stopwords.pkl.
# NOTE(review): presumably precomputed from a separate hate-speech corpus;
# its provenance is not visible in this notebook.
hatespeech = load_obj("ngrams_hatespeech_with_stopwords")

In [9]:
# Add the "2-grams" entry of the loaded object as one more corpus. The key
# prefix here is "keywords/", unlike the "keyword/" prefix of the zip entries.
# The list is stored by reference, not copied.
bigrams["keywords/hatespeech"] = hatespeech["2-grams"]

In [10]:
# Inspect the first 100 hate-speech bigrams.
bigrams["keywords/hatespeech"][:100]

[('rosenmontag', 'ist'),
 ('ist', 'abgesagt'),
 ('rapefugees', 'also'),
 ('also', 'wieder'),
 ('wieder', 'wie'),
 ('wie', 'gewohnt'),
 ('gewohnt', 'ins'),
 ('ins', 'schwimmbad'),
 ('schwimmbad', 'gehen'),
 ('bitte', 'nicht'),
 ('nicht', 'die'),
 ('die', 'türkei'),
 ('türkei', 'zum'),
 ('zum', 'eu-mitglied'),
 ('eu-mitglied', 'machen'),
 ('menschenrechte', 'pressefreiheit'),
 ('pressefreiheit', 'islamisierung'),
 ('islamisierung', 'wieso'),
 ('wieso', 'bekommen'),
 ('bekommen', 'rapefugees'),
 ('rapefugees', 'mehr'),
 ('mehr', 'als'),
 ('als', 'unsere'),
 ('unsere', 'hartz'),
 ('hartz', 'enpfänger'),
 ('esreicht', 'den'),
 ('den', 'verfluchten'),
 ('verfluchten', 'rapefugees'),
 ('rapefugees', 'den'),
 ('den', 'krieg'),
 ('krieg', 'erklären'),
 ('wasfehlt', 'war'),
 ('war', 'das'),
 ('das', 'wochenende'),
 ('wochenende', 'im'),
 ('im', 'ruhrpott'),
 ('ruhrpott', 'unterwegs'),
 ('überall', 'schwangere'),
 ('schwangere', 'muslimische'),
 ('muslimische', 'frauen'),
 ('frauen', 'mit'),
 ('m

In [11]:
def transform_ngrams(dic):
    """Turn every n-gram tuple into a single underscore-joined string.

    Args:
        dic: Mapping from corpus name to an iterable of n-grams, each an
            iterable of token strings.

    Returns:
        A new dict with the same keys, each mapped to a list of strings such
        as ``"in_der"``, in the original order.
    """
    return {
        name: ["_".join(gram) for gram in grams]
        for name, grams in dic.items()
    }

In [12]:
# Convert every bigram tuple into a single underscore-joined string.
bigram = transform_ngrams(bigrams)

In [13]:
# Display the transformed mapping (the whole dict is rendered).
bigram

{'keyword/automobil_50k.txt': ['der_soll',
  'soll_der',
  'der_konkurrenz',
  'konkurrenz_von',
  'von_skoda',
  'skoda_und',
  'und_ford',
  'ford_ordentlich',
  'ordentlich_einheizen',
  'mit_dem',
  'dem_bidirektionalen',
  'bidirektionalen_schlüssel',
  'schlüssel_kann',
  'kann_je',
  'je_nach',
  'nach_system',
  'system_auch',
  'auch_geprüft',
  'geprüft_werden',
  'werden_ob',
  'ob_das',
  'das_licht',
  'licht_noch',
  'noch_brennt',
  'brennt_oder',
  'oder_das',
  'das_radio',
  'radio_noch',
  'noch_spielt',
  'auch_monatlich',
  'monatlich_günstiger',
  'günstiger_die',
  'die_niedrigeren',
  'niedrigeren_anschaffungspreise',
  'anschaffungspreise_der',
  'der_kastenwagen',
  'kastenwagen_wirken',
  'wirken_sich',
  'sich_auf',
  'auf_die',
  'die_monatlichen',
  'monatlichen_kosten',
  'kosten_dämpfend',
  'dämpfend_aus',
  'bei_aller',
  'aller_liebe',
  'liebe_zum',
  'zum_detail',
  'detail_haben',
  'haben_die',
  'die_entwickler',
  'entwickler_dennoch',
  'dennoc

In [14]:
# Count every bigram across all corpora. Feeding the lists to update() one at
# a time avoids sum(lists, []), which re-copies the growing concatenation once
# per corpus before any counting starts.
vocab = nltk.FreqDist()
for grams in bigram.values():
    vocab.update(grams)

In [15]:
# Rank the (bigram, count) pairs from most to least frequent.
vocab = vocab.most_common()

In [16]:
# Drop the counts and keep only the bigram strings, still in frequency order.
vocab = [term for term, _count in vocab]

In [17]:
# Show the 100 most frequent bigrams across all corpora.
vocab[:100]

['in_der',
 'in_den',
 'für_die',
 'mit_dem',
 'auf_den',
 'bei_der',
 'in_die',
 'für_den',
 'auf_die',
 'und_die',
 'auf_der',
 'mit_einem',
 'mit_der',
 'auf_dem',
 'an_der',
 'von_der',
 'bei_den',
 'aus_dem',
 'sich_die',
 'auch_die',
 'und_der',
 'mehr_als',
 'nach_dem',
 'gibt_es',
 'dass_die',
 'ist_der',
 'nicht_mehr',
 'das_ist',
 'in_deutschland',
 'vor_allem',
 'über_die',
 'an_den',
 'ist_die',
 'um_die',
 'für_das',
 'aus_der',
 'mit_den',
 'vor_dem',
 'an_die',
 'in_einem',
 'nicht_nur',
 'es_ist',
 'ist_das',
 'gegen_die',
 'nach_der',
 'mit_einer',
 'durch_die',
 'gegen_den',
 'sich_der',
 'ist_ein',
 'und_das',
 'auch_in',
 'hat_sich',
 'in_einer',
 'in_diesem',
 'zu_den',
 'um_prozent',
 'millionen_euro',
 'sind_die',
 'von_den',
 'noch_nicht',
 'bis_zu',
 'aber_auch',
 'auch_der',
 'dass_der',
 'hat_die',
 'wenn_sie',
 'milliarden_euro',
 'ist_es',
 'wie_die',
 'und_den',
 'der_deutschen',
 'den_usa',
 'sich_in',
 'vor_der',
 'nur_noch',
 'um_den',
 'prozent_auf',
 

In [18]:
# One frequency distribution per corpus: bigram string -> count in that corpus.
term_freq = {genre: nltk.FreqDist(text) for genre, text in bigram.items()}

In [19]:
# Display the per-corpus frequency distributions.
term_freq

{'keyword/automobil_50k.txt': FreqDist({'in_der': 2134, 'in_den': 1487, 'mit_dem': 1478, 'für_die': 1275, 'auf_der': 1198, 'für_den': 1048, 'gibt_es': 972, 'auf_den': 962, 'ist_der': 936, 'auf_dem': 895, ...}),
 'keyword/news_text100k.txt': FreqDist({'in_der': 2692, 'in_den': 1382, 'für_die': 1299, 'bei_der': 875, 'mit_dem': 839, 'auf_die': 810, 'und_die': 733, 'auf_der': 731, 'in_die': 696, 'an_der': 672, ...}),
 'keyword/sport_50k.txt': FreqDist({'in_der': 4728, 'für_die': 1626, 'in_den': 1156, 'in_die': 1097, 'mit_dem': 1091, 'auf_den': 1086, 'nach_dem': 1052, 'bei_der': 899, 'für_den': 887, 'gegen_den': 857, ...}),
 'keyword/wirtschaft_50k.txt': FreqDist({'in_der': 2147, 'in_den': 1934, 'für_die': 1730, 'bei_der': 915, 'um_prozent': 868, 'und_die': 855, 'milliarden_euro': 831, 'dass_die': 798, 'auf_den': 775, 'mit_der': 768, ...}),
 'keywords/hatespeech': FreqDist({'in_der': 566, 'das_ist': 432, 'in_den': 339, 'in_deutschland': 327, 'und_die': 312, 'nicht_mehr': 306, 'für_die': 296

In [20]:
def create_dtm(dictionary, vocab):
    """Build a document-term count matrix.

    Args:
        dictionary: Mapping from document name to a count mapping
            (term -> count) that supports ``.get``.
        vocab: Sequence of terms defining the column order.

    Returns:
        A NumPy array with one row per value of ``dictionary`` (in iteration
        order) and one column per entry of ``vocab``; terms missing from a
        document count as 0.
    """
    rows = [
        [counts.get(term, 0) for term in vocab]
        for counts in dictionary.values()
    ]
    return np.array(rows)
           

In [21]:

def tfidf(matrix, terms, voc, n):
    """Return the ``n`` highest-weighted TF-IDF terms of every document.

    The weight of a term in a document is its relative frequency in that
    document times ``log(number_of_documents / document_frequency)``.

    Args:
        matrix: 2-D array of raw counts, one row per document and one column
            per vocabulary entry.
        terms: Mapping whose keys name the documents, in the row order of
            ``matrix``.
        voc: Sequence of vocabulary entries, in the column order of ``matrix``.
        n: Number of top terms to return per document. Values larger than the
            vocabulary are clamped; values below 1 yield empty lists.

    Returns:
        A dict mapping each key of ``terms`` to a list of at most ``n``
        entries of ``voc``, highest weight first.
    """
    counts = np.asarray(matrix, dtype=float)
    n_docs = counts.shape[0]

    # Term frequency. A document with no counts at all gets weight 0 for
    # every term instead of 0/0 = NaN.
    row_totals = counts.sum(axis=1)[:, np.newaxis]
    tf = np.divide(counts, row_totals,
                   out=np.zeros_like(counts), where=row_totals > 0)

    # Inverse document frequency. A term that occurs in no document gets
    # idf 0 instead of log(n/0) = inf, which would become NaN after the
    # multiplication and be ranked first by the reversed argsort below.
    doc_freq = (counts > 0).sum(axis=0)
    idf = np.zeros(counts.shape[1])
    seen = doc_freq > 0
    idf[seen] = np.log(n_docs / doc_freq[seen])

    weights = tf * idf

    # Slicing clamps to the vocabulary size, so an oversized n cannot raise.
    top_n = max(n, 0)
    docs = {}
    for index, key in enumerate(terms.keys()):
        ranking = weights[index].argsort()[::-1][:top_n]
        docs[key] = [voc[col] for col in ranking]
    return docs

In [22]:
# Document-term matrix: one row per corpus in term_freq, one column per entry
# of vocab.
dtm=create_dtm(term_freq,vocab)

In [23]:

# Top 1000 bigrams per corpus, ranked by TF-IDF weight.
results = tfidf(dtm, term_freq, vocab,1000)


In [24]:
# Select the ranked keyword list of the hate-speech corpus (a reference to the
# list held in results, not a copy).
result_list= results["keywords/hatespeech"]



In [25]:
# Turn the underscore-joined bigrams back into space-separated phrases,
# updating the existing list in place.
result_list[:] = [phrase.replace("_", " ") for phrase in result_list]

In [26]:
# Display the final keyword phrases.
result_list

['die afd',
 'der islam',
 'der afd',
 'bla bla',
 'die flüchtlinge',
 'die asylanten',
 'le pen',
 'den islam',
 'die fresse',
 'dem islam',
 "geht 's",
 'den arsch',
 'afd wählen',
 'pädophile und',
 'die islamisierung',
 'die gutmenschen',
 'zum kotzen',
 'ach du',
 'schmarotzer pädophile',
 'im erika',
 'islam ist',
 'frau petry',
 'muss weg',
 'für flüchtlinge',
 'die moslems',
 'die lügenpresse',
 "gibt 's",
 'partei deutschlands',
 'und denunzianten',
 'unsere politiker',
 'den flüchtlingen',
 'asylanten in',
 'den knast',
 'volk und',
 'unsere regierung',
 'islam gehört',
 'spd schmarotzer',
 'ein nazi',
 'flüchtlinge sind',
 'armes deutschland',
 'die merkel',
 'den asylanten',
 'dem pack',
 'lügenpresse und',
 'von pegida',
 'die muslime',
 'der abgefuckten',
 'sein volk',
 'die grünen',
 'das volk',
 'eigene volk',
 'die uno',
 'für asylanten',
 'die scharia',
 'schaut euch',
 'marine le',
 'das pack',
 'flüchtlinge nicht',
 'gt am',
 'wollte bescheid',
 'frau le',
 'fordern

In [27]:
np.savetxt("bigram_keywords_tf_idf_with_news_with_stopwords.csv", result_list, delimiter =",",fmt ='% s')