In [190]:
import pandas as pd
import numpy as np
from HanTa import HanoverTagger as ht
from tqdm.auto import tqdm
import top2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import gensim
import pickle
import mgzip
from nltk.tokenize import sent_tokenize
from pandarallel import pandarallel
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
import pyreadr

#pandarallel.initialize(progress_bar=True)
tqdm.pandas()

# Feature engineering

In [2]:
# load command
with mgzip.open('../data/combined_news_pre.mgzip', 'rb') as handle:
    combined_news_pre = pickle.load(handle)

In [3]:
combined_news_pre

Unnamed: 0,title,date,combined_text,newspaper,text_token,combined_text_joined
0,"Handel: ""Viele brauchen ja eigentlich nichts m...",2021-11-30 06:07:00,"ZEIT: Herr Weber, Herr Rauschen, Herr Greiner,...",zeit,"[herr, weber, herr, rauschen, herr, greiner, w...",herr weber herr rauschen herr greiner weihnach...
1,DGB-Index Gute Arbeit: An den Kosten im Homeof...,2021-11-30 05:50:00,Wie gut sind die Arbeitsbedingungen in Deutsch...,zeit,"[arbeitsbedingung, deutschland, beginn, corona...",arbeitsbedingung deutschland beginn coronapand...
2,Coronavirus: RKI registriert erstmals wieder l...,2021-11-30 04:23:00,Die Zahl der Neuinfektionen pro 100.000 \nEinw...,zeit,"[zahl, neuinfektion, 100000, einwohner, woche,...",zahl neuinfektion 100000 einwohner woche angab...
3,Bund-Länder-Runde: Kanzleramtschef fordert Cor...,2021-11-30 02:01:00,Vor der Bund-Länder-Schalte zur Corona-Krise\n...,zeit,"[bundländerschalte, coronakrise, pochen, gesch...",bundländerschalte coronakrise pochen geschäfts...
4,Corona-Impfung für Kinder: Gesundheitsminister...,2021-11-29 19:22:00,Die Gesundheitsministerinnen und -minister der...,zeit,"[gesundheitsministerin, minister, land, corona...",gesundheitsministerin minister land coronaimpf...
...,...,...,...,...,...,...
12435,Virus aus Wuhan in Japan festgestellt,2020-01-16 18:17:00,Nach mehr als 40 Infektionen mit einem neuarti...,tagespiegel,"[40, infektion, neuartig, coronavirus, chinesi...",40 infektion neuartig coronavirus chinesisch w...
12436,"Erster Todesfall durch neues, Lungenkrankheit ...",2020-01-12 11:51:00,Erstmals ist ein Patient an der geheimnisvolle...,tagespiegel,"[erstmals, patient, geheimnisvoll, lungenkrank...",erstmals patient geheimnisvoll lungenkrankheit...
12437,Neues Virus ist wahrscheinlich Ursache von Lun...,2020-01-09 13:38:00,Die Ausbreitung einer zuvor unbekannten Lungen...,tagespiegel,"[ausbreitung, zuvor, unbekannt, lungenkrankhei...",ausbreitung zuvor unbekannt lungenkrankheit ze...
12438,Rätselraten über ein Virus,2020-01-07 17:24:00,Nachrichten aus der zentralchinesischen Provin...,tagespiegel,"[nachricht, zentralchinesisch, provinz, hubei,...",nachricht zentralchinesisch provinz hubei weck...


## Average sentence length

In [4]:
def avg_sent_len(input):
    sent_tokenize_list = sent_tokenize(input)
    return sum(len(x.split()) for x in sent_tokenize_list) / len(sent_tokenize_list)

In [5]:
def avg_sent(input):
    sent_tokenize_list = sent_tokenize(input)
    return len(sent_tokenize_list)

In [6]:
combined_news_pre['avg_length_sent'] = combined_news_pre['combined_text'].progress_apply(avg_sent)

  0%|          | 0/28432 [00:00<?, ?it/s]

In [7]:
combined_news_pre['avg_length'] = combined_news_pre['combined_text'].progress_apply(avg_sent_len)

  0%|          | 0/28432 [00:00<?, ?it/s]

## Presence of adjectives

In [12]:
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

In [13]:
def pos_count(input):
    keywords_adj = ['ADJA', 'ADJD', 'ADV']
    keywords_noun = ['NN']
    count_adj = 0
    count_nn = 0
    count_total = 0
    for word in input:
        count_total += 1
        tag = tagger.analyze(word)[1]
        if tag in keywords_adj:
            count_adj += 1
        if tag in keywords_noun:
            count_nn += 1
    if count_total == 0:
        count_total = 1
    return count_adj, count_nn, count_adj/count_total, count_nn/count_total

In [14]:
combined_news_pre['pos_count'] = combined_news_pre['text_token'].parallel_apply(pos_count)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3554), Label(value='0 / 3554'))), …

In [15]:
combined_news_pre[['count_adj', 'count_nn', 'percent_adj', 'percent_nn']] = pd.DataFrame(combined_news_pre['pos_count'].tolist(), index=combined_news_pre.index)

In [16]:
combined_news_pre.drop('pos_count', axis=1, inplace=True)

## Mention of scientist

In [17]:
def scientist_count(input):
    keywords = ['drosten', 'streeck', 'lauterbach', 'kekulé', 'schmidt-chanasit', 'wodarg']
    scientist = []
    for word in input:
        if word in keywords:
            scientist.append(word)
    return scientist

In [18]:
df_scientists = combined_news_pre['text_token'].apply(scientist_count).str.join('|').str.get_dummies()

In [19]:
combined_news_pre = pd.concat([combined_news_pre, df_scientists], axis=1)

## create doc2vecs

In [35]:
tagged_data = [TaggedDocument(words=_d, tags=[str(i)]) for i, _d in enumerate(combined_news_pre['text_token'])]

In [38]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40, dm=1)

In [42]:
model.build_vocab(tagged_data)

In [43]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

In [77]:
doc_ids = []
for doc_id in range(len(combined_news_pre)):
    doc_ids.append(list(model.dv[doc_id]))
combined_news_pre['docvecs'] = doc_ids

In [79]:
combined_news_pre[["vec" + str(int(x)) for x in np.linspace(start=0, stop=99, num=100)]] = combined_news_pre.docvecs.apply(pd.Series)

  combined_news_pre[["vec" + str(int(x)) for x in np.linspace(start=0, stop=99, num=100)]] = combined_news_pre.docvecs.apply(pd.Series)
  combined_news_pre[["vec" + str(int(x)) for x in np.linspace(start=0, stop=99, num=100)]] = combined_news_pre.docvecs.apply(pd.Series)
  combined_news_pre[["vec" + str(int(x)) for x in np.linspace(start=0, stop=99, num=100)]] = combined_news_pre.docvecs.apply(pd.Series)
  combined_news_pre[["vec" + str(int(x)) for x in np.linspace(start=0, stop=99, num=100)]] = combined_news_pre.docvecs.apply(pd.Series)


In [81]:
combined_news_pre.drop('docvecs', inplace=True, axis=1)

## Load topic modeling

In [106]:
# load precomputed model (learn_ngram OR learn_ngram_full), full uses original text, other one uses preprocessed texts
model = top2vec.Top2Vec.load('../models/topic2vec_learn_ngram.t2v')

In [107]:
model.hierarchical_topic_reduction(17)

[[105,
  19,
  126,
  44,
  111,
  158,
  265,
  124,
  185,
  213,
  132,
  244,
  161,
  57,
  72,
  59,
  245,
  187,
  148,
  228,
  246,
  109,
  171,
  117,
  137,
  62,
  130,
  173,
  26,
  206,
  192,
  217,
  266,
  135,
  210,
  144,
  67,
  220,
  69,
  46,
  227,
  81,
  190,
  254,
  243,
  83],
 [271,
  248,
  48,
  32,
  114,
  128,
  65,
  193,
  208,
  56,
  77,
  212,
  154,
  87,
  31,
  151,
  129,
  14,
  222,
  39,
  103,
  168,
  66,
  252,
  28,
  5],
 [202,
  267,
  189,
  269,
  85,
  272,
  209,
  17,
  200,
  257,
  255,
  104,
  95,
  123,
  195,
  249,
  99,
  37,
  116,
  153,
  13,
  34,
  188,
  152,
  175,
  247,
  40,
  15,
  233,
  268,
  42,
  30],
 [73,
  164,
  237,
  207,
  184,
  51,
  78,
  122,
  54,
  260,
  12,
  138,
  97,
  18,
  263,
  91,
  90,
  53,
  98,
  25,
  127,
  1],
 [218,
  22,
  182,
  236,
  112,
  75,
  234,
  160,
  259,
  251,
  230,
  176,
  139,
  258,
  8,
  199,
  60,
  55,
  93,
  88,
  134,
  147,
  68],
 [215, 145,

In [108]:
topic_nums, topic_score, topic_words, word_scores = model.get_documents_topics(list(combined_news_pre.index), reduced=True, num_topics=2)

In [114]:
combined_news_pre[['top2vec_0', 'top2vec_1']] = topic_nums

# Load sentiment predictions

In [159]:
# load command
with mgzip.open('../data/combined_comments_group.mgzip', 'rb') as handle:
    combined_comments_group = pickle.load(handle)

In [160]:
predict = pd.DataFrame()

In [161]:
predict = combined_news_pre

In [162]:
predict = predict.merge(combined_comments_group[['title', 'lexi_score', 'bert_score', 'vote_score', 'bert_label', 'count']], how = 'left', on = 'title')

In [163]:
predict.drop(['title','date', 'combined_text', 'text_token', 'combined_text_joined'], inplace=True, axis=1)

In [164]:
predict[['tagespiegel', 'welt', 'zeit']] = pd.get_dummies(predict['newspaper'])

In [165]:
predict.drop('newspaper', inplace=True, axis=1)

In [166]:
predict[predict['bert_score'].notna()]

Unnamed: 0,avg_length_sent,avg_length,count_adj,count_nn,percent_adj,percent_nn,drosten,kekulé,lauterbach,streeck,...,top2vec_0,top2vec_1,lexi_score,bert_score,vote_score,bert_label,count,tagespiegel,welt,zeit
0,132,12.931818,99,308,0.159420,0.495974,0,0,0,0,...,11,7,0.159247,-0.534319,0.192604,negative,16.0,0,0,1
1,45,17.200000,57,177,0.182692,0.567308,0,0,0,0,...,1,7,0.210111,-0.144371,0.338854,negative,26.0,0,0,1
2,47,14.638298,62,136,0.185075,0.405970,0,0,0,0,...,16,15,0.119507,-0.370915,0.148937,neutral,80.0,0,0,1
3,27,17.703704,37,118,0.171296,0.546296,0,0,0,0,...,3,13,0.099042,-0.469456,0.107543,negative,89.0,0,0,1
4,23,17.000000,25,96,0.154321,0.592593,0,0,0,0,...,6,15,0.097489,-0.436324,0.097489,neutral,37.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28446,38,17.315789,39,110,0.170306,0.480349,0,0,0,0,...,1,11,0.129952,-0.992965,0.129952,negative,1.0,1,0,0
28447,40,13.325000,46,133,0.183267,0.529880,0,0,0,0,...,10,8,0.021458,0.045299,0.045299,neutral,1.0,1,0,0
28448,49,15.693878,50,175,0.145349,0.508721,0,0,0,0,...,6,12,0.130321,-0.059480,0.130321,neutral,2.0,1,0,0
28451,39,15.769231,46,148,0.159722,0.513889,0,0,0,0,...,14,3,0.067709,-0.893052,0.067709,negative,1.0,1,0,0


In [167]:
with mgzip.open("../data/predict_raw.mgzip", 'wb') as f:
    pickle.dump(predict, f)

In [168]:
with mgzip.open("../data/predict_na.mgzip", 'wb') as f:
    pickle.dump(predict[predict['bert_score'].notna()], f)

In [169]:
predict.drop(['lexi_score', 'vote_score'], inplace=True, axis=1)

In [170]:
with mgzip.open("../data/predict_bert_topic.mgzip", 'wb') as f:
    pickle.dump(predict[predict['bert_score'].notna()], f)

In [171]:
predict.drop(['top2vec_0', 'top2vec_1'], inplace=True, axis=1)

In [172]:
with mgzip.open("../data/predict_bert_notopic.mgzip", 'wb') as f:
    pickle.dump(predict[predict['bert_score'].notna()], f)

# load the version we like to explore

In [182]:
with mgzip.open('../data/predict_bert_topic.mgzip', 'rb') as handle:
    predict = pickle.load(handle)

In [183]:
X = predict[predict['bert_score'].notna()]

In [184]:
y = X['bert_label']

In [185]:
X.drop(['bert_score', 'bert_label'], axis=1, inplace=True)

In [186]:
X

Unnamed: 0,avg_length_sent,avg_length,count_adj,count_nn,percent_adj,percent_nn,drosten,kekulé,lauterbach,streeck,...,vec96,vec97,vec98,vec99,top2vec_0,top2vec_1,count,tagespiegel,welt,zeit
0,132,12.931818,99,308,0.159420,0.495974,0,0,0,0,...,0.223306,1.547494,-0.364949,0.230211,11,7,16.0,0,0,1
1,45,17.200000,57,177,0.182692,0.567308,0,0,0,0,...,-1.238253,-0.023410,4.192199,-1.047301,1,7,26.0,0,0,1
2,47,14.638298,62,136,0.185075,0.405970,0,0,0,0,...,-0.529035,0.767990,1.726258,1.916841,16,15,80.0,0,0,1
3,27,17.703704,37,118,0.171296,0.546296,0,0,0,0,...,-0.049648,0.052805,2.081748,2.100051,3,13,89.0,0,0,1
4,23,17.000000,25,96,0.154321,0.592593,0,0,0,0,...,-0.616699,-1.511315,1.762218,1.740974,6,15,37.0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28446,38,17.315789,39,110,0.170306,0.480349,0,0,0,0,...,-0.417176,0.415955,0.327358,-0.064108,1,11,1.0,1,0,0
28447,40,13.325000,46,133,0.183267,0.529880,0,0,0,0,...,-0.862138,0.045821,1.401799,-1.091150,10,8,1.0,1,0,0
28448,49,15.693878,50,175,0.145349,0.508721,0,0,0,0,...,-0.164128,0.295489,1.617298,-1.044370,6,12,2.0,1,0,0
28451,39,15.769231,46,148,0.159722,0.513889,0,0,0,0,...,-1.761315,-0.497971,-0.103931,-1.272163,14,3,1.0,1,0,0


In [187]:
y

0        negative
1        negative
2         neutral
3        negative
4         neutral
           ...   
28446    negative
28447     neutral
28448     neutral
28451    negative
28453     neutral
Name: bert_label, Length: 24285, dtype: object

In [188]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [138]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5 , cv = 3, verbose = 3, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

rf_random.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.046 total time=  21.5s
[CV 1/3] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1400;, score=0.049 total time= 2.8min




[CV 3/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.054 total time=  21.8s
[CV 2/3] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1400;, score=0.064 total time= 2.8min
[CV 1/3] END bootstrap=False, max_depth=90, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=600;, score=0.049 total time= 1.6min
[CV 3/3] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=1400;, score=0.059 total time= 2.8min
[CV 1/3] END bootstrap=False, max_depth=60, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=600;, score=-0.783 total time=19.4min
[CV 2/3] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.058 total time=  21.2s
[CV 3/3] END bootstrap=False, max_depth=60, max_features=auto, min_samples

{'n_estimators': 600,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 90,
 'bootstrap': False}

### Regressor

In [181]:
#regr = RandomForestRegressor(verbose=1, n_jobs=-1)
regr = RandomForestRegressor(verbose = 1, n_jobs = -1, n_estimators = 600, min_samples_split = 10, min_samples_leaf = 4, max_features = 'sqrt', max_depth = 90, bootstrap = False)
regr.fit(X_train, y_train)
regr.score(X_test, y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   17.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 600 out of 600 | elapsed:    0.2s finished


0.07673947859024188

### Classifier

In [191]:
regr = RandomForestClassifier(verbose = 1, n_jobs = -1, n_estimators = 670, min_samples_split = 2, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 89, criterion = 'entropy')
regr.fit(X_train,y_train)
regr.score(X_test,y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 670 out of 670 | elapsed:   17.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 670 out of 670 | elapsed:    0.2s finished


0.6249532127261385