In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
from ast import literal_eval
from HanTa import HanoverTagger as ht
import string
from tqdm.auto import tqdm
from collections import Counter
import top2vec
import torch
from itertools import chain
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from typing import List
import re
import pickle
import mgzip

tqdm.pandas()

# WELT Dataset comments sentiment classification
## Preprocessing
First, the original scraped text file is loaded (note: I use gzip compression due to the big size of the files). Since tThe original file is in the wide format, I use the melt function of pandas to transpose the data into the long format. This enables to compare the methods of sentiment classification more easily later.

In [12]:
welt = pd.read_csv('../data/welt_scraped.gzip', compression='gzip', low_memory=False)

In [13]:
# columns initially scraped but not necessary for the analysis anymore are deleted
# the article text is removed since this notebook is only for the sentiment analysis
# the article title serves as the ID of each article, since it is shorter than the article text
welt.drop(['Tag0', 'Tag1', 'Tag2', 'Tag3', 'combined_text', 'link'], axis=1, inplace=True)

In [14]:
comment_cols = [col for col in welt.columns if 'Comment' in col]

In [15]:
welt = pd.melt(welt, id_vars=['title', 'date'], value_vars=comment_cols)

In [16]:
welt.dropna(inplace=True)

### Tokenization & punctuation removal

In [19]:
welt['tokens'] = welt['value'].progress_apply(nltk.word_tokenize)

  0%|          | 0/1211850 [00:00<?, ?it/s]

In [20]:
def remove_punctuation(text):
    text_list = []
    for word in text:
        for punctuation in string.punctuation:
            word = word.replace(punctuation, '')
        if not word:
            continue
        text_list.append(word.lower())
    return text_list

In [None]:
welt['tokens'] = welt['tokens'].progress_apply(remove_punctuation)

### Stop word removal & lemmatization
Following, stopwords without meaning ('der', 'wo', etc.) are removed from the comments. I edited the sourced stopwords list and removed any negations, since these contain emotions that I want to capture later in the workflow. Afterwards, the remaining tokenized words for each comment are lemmatized, i.e. reduced to their basic form. In order to do this I utilise the 'Hannover Tagger', which also provides part of speech information, although I do not use it (the main reason that I do not use the PoS information is that the utilised German emotion and sentiment lexical use different PoS abbreviations, resulting in no matches).

In [22]:
stopwords = open('../resources/german_stopwords-master/german_stopwords_plain.txt').read().splitlines()

In [23]:
def stop_word_removal(x):
    return list([w for w in x if not w in stopwords])

In [24]:
welt['tokens'] = welt['tokens'].progress_apply(stop_word_removal)

  0%|          | 0/1211850 [00:00<?, ?it/s]

In [5]:
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

In [26]:
def tagger_custom(input):
    tmp_list = []
    for word in input:
        tmp_list.append(tagger.analyze(word))
    return tmp_list

In [27]:
welt['tokens'] = welt['tokens'].progress_apply(tagger_custom)

  0%|          | 0/1211850 [00:00<?, ?it/s]

In [28]:
with mgzip.open("../data/welt_comments_preprocessed.mgzip", 'wb') as f:
    pickle.dump(welt, f)

In [2]:
# open from disk
with mgzip.open('../data/welt_comments_preprocessed.mgzip', 'rb') as handle:
    welt_pre = pickle.load(handle)

## Sentiment classification with lexicon (comments)
Similar to the previous step, the comments are also assigned to negative or positive sentiment overall, in a similar fashion. The result is saved in a new file with the ending '_senti_clas'.

In [10]:
senti_merge = pd.read_csv('../resources/sentimerge/data/sentimerge.txt', sep='\t')
senti_merge.drop(['PoS'], axis=1, inplace=True)

In [11]:
senti_merge_pos = senti_merge.loc[senti_merge['sentiment'] > 0]
senti_merge_neg = senti_merge.loc[senti_merge['sentiment'] < 0]

In [12]:
senti_pos_dict = dict(zip(senti_merge_pos['lemma'], senti_merge_pos['sentiment']))
senti_neg_dict = dict(zip(senti_merge_neg['lemma'], senti_merge_neg['sentiment']))
senti_weight_dict = dict(zip(senti_merge['lemma'], senti_merge['weight']))

In [13]:
def senti_class(input):
    tmp_list = []
    for word in input:
        if not word:
            continue
        x, y  = word
        if str(x).lower() in senti_pos_dict:
            tmp_list.append((senti_pos_dict[x.lower()]*senti_weight_dict[x.lower()]))
        if str(x).lower() in senti_neg_dict:
            tmp_list.append((senti_neg_dict[x.lower()]*senti_weight_dict[x.lower()]))
    if not tmp_list:
        return np.nan
    return sum(tmp_list)

In [14]:
def rescale(input):
    input -= input.min()
    input /= input.max()
    return input

In [15]:
def lexi_label(input):
    if input > 1:
        return 'positive'
    elif input < -1:
        return 'negative'
    elif pd.isna(input):
        return np.nan
    else:
        return 'neutral'

In [16]:
welt_pre['lexi_score'] = welt_pre['tokens'].progress_apply(senti_class)

  0%|          | 0/1211850 [00:00<?, ?it/s]

In [17]:
welt_pre['lexi_label'] = welt_pre['lexi_score'].apply(lexi_label)

In [18]:
welt_pre['lexi_score'] = rescale(welt_pre['lexi_score'])

In [None]:
import plotly.express as px
fig = px.histogram(welt_pre, x="lexi_score", nbins=1000)
fig.show()

In [19]:
welt_lexi = welt_pre[['title', 'value', 'lexi_score', 'lexi_label']]

In [21]:
welt_lexi

Unnamed: 0,title,value,lexi_score,lexi_label
0,Mitten im Spiel beschimpft Zverev seinen Vater,Hier mein Kommentar zu seinem ersten Spiel. „O...,0.453691,negative
1,"Seenotretter beschimpfen Kurz als ""Baby-Hitler""","SCHADE, dass wir in Deutschland nicht so einen...",0.611370,positive
2,Trump beschimpft getöteten General Soleimani a...,"Ich glaube, so langsam tun Hämorriden Herrn Pr...",0.564885,negative
3,Chinesisches Corona-Virus hat die USA erreicht,"WELT-Korrespondent Birger Nicolai hat über ""Di...",0.417291,negative
4,Was Sie über das Coronavirus wissen müssen,Vielen Dank für den informativen Artikel. Inte...,0.635331,positive
...,...,...,...,...
205095158,14.432 neue Fälle und 500 Tote – Corona-Zahlen...,"Wer sagt denn, ob unter den positiven PCR-Tes...",0.601666,positive
205102484,14.432 neue Fälle und 500 Tote – Corona-Zahlen...,Wieviele intensivmedizinisch betreute und beat...,0.596027,positive
205109810,14.432 neue Fälle und 500 Tote – Corona-Zahlen...,"Die ganzen Daten, welche uns vom RKI kommunizi...",0.526179,negative
205117136,14.432 neue Fälle und 500 Tote – Corona-Zahlen...,"Bei er aktuellen Datenlage, sollte dem Lockeru...",0.579603,negative


In [22]:
# saving to disk
with mgzip.open('../data/welt_lexi.mgzip', 'wb') as handle:
    pickle.dump(welt_lexi, handle)

## Sentiment classification with BERT (comments)
##### File was produced on Google Colab due to GPU acceleration, saved as 'welt_senti_clas_bert.gzip'.
In this step, the comments are classified for sentiment using the pretrained BERT model "GermanSentiment". This model was trained on 1.8 Million German language samples from various domains. This serves as a second method next to the lexicon approach and will be used to compare both methods. I modified the given code to also return the confidence logit values for each prediction. These are used to compare this and the lexicon methods.

In [52]:
class SentimentModel():
    def __init__(self, model_name: str):
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
        self.clean_http_urls = re.compile(r'https*\\S+', re.MULTILINE)
        self.clean_at_mentions = re.compile(r'@\\S+', re.MULTILINE)

    def predict_sentiment(self, texts):
        if np.any(pd.isna(texts)):
          return np.nan
        texts = [self.clean_text(str(text)) for text in [texts]]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
        encoded = self.tokenizer.batch_encode_plus(texts,padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
        encoded = encoded.to(self.device)
        with torch.no_grad():
                logits = self.model(**encoded)

        label_ids = torch.argmax(logits[0], axis=1)
        for a in zip(logits[0].tolist(), [self.model.config.id2label[label_id] for label_id in label_ids.tolist()]):
            tmp_rsl = list(a)
        return tmp_rsl

    def replace_numbers(self,text: str) -> str:
            return text.replace("0"," null").replace("1"," eins").replace("2"," zwei").replace("3"," drei").replace("4"," vier").replace("5"," fünf").replace("6"," sechs").replace("7"," sieben").replace("8"," acht").replace("9"," neun")

    def clean_text(self,text: str)-> str:
            text = text.replace("\n", " ")
            text = self.clean_http_urls.sub('',text)
            text = self.clean_at_mentions.sub('',text)
            text = self.replace_numbers(text)
            text = self.clean_chars.sub('', text) # use only text chars
            text = ' '.join(text.split()) # substitute multiple whitespace with single whitespace
            text = text.strip().lower()
            return text

In [53]:
model = SentimentModel(model_name = "oliverguhr/german-sentiment-bert")

In [56]:
welt_pre['bert'] = welt_pre['value'].progress_apply(model.predict_sentiment)

  0%|          | 0/454564 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [61]:
def bert_score(input):
    return np.ptp(sorted(list(map(abs, input[0]))))

def bert_label(input):
    return input[1]

In [62]:
welt_pre['bert_score'] = welt_pre['bert'].apply(bert_score)

In [63]:
welt_pre['bert_label'] = welt_pre['bert'].apply(bert_label)

In [2]:
rescale(welt_pre['bert_score'])

In [None]:
welt_bert = welt_pre[['title', 'bert_score', 'bert_label']]

In [None]:
with mgzip.open(".../data/welt_bert.mgzip", 'wb') as f:
    pickle.dump(welt_bert, f)

## SSentiA comments (comparison of lexicon & BERT methods)

### Predicting labels
For this step, I compare the predictions of both prior methods and pick the ones with the highest confident value. Further, the high confident predictions will be used to train other ML models to improve low confidence predictions.

In [3]:
# load command
with mgzip.open('../data/welt_lexi.mgzip', 'rb') as handle:
    welt_lexi = pickle.load(handle)

In [3]:
# load command
with mgzip.open('../data/welt_bert.mgzip', 'rb') as handle:
    welt_bert = pickle.load(handle)

In [4]:
welt_senti_compare = pd.DataFrame()

In [5]:
welt_senti_compare[['title', 'comment', 'lexi_score','lexi_label']] = welt_lexi[['title', 'value', 'lexi_score','lexi_label']]
welt_senti_compare[['bert_score','bert_label']] = welt_bert[['bert_score','bert_label']]

In [6]:
welt_senti_compare['vote'] = np.where((welt_senti_compare['lexi_score'] > welt_senti_compare['bert_score']), 'lexi', 'bert')
welt_senti_compare['vote_score'] = np.where((welt_senti_compare['lexi_score'] > welt_senti_compare['bert_score']), welt_senti_compare['lexi_score'], welt_senti_compare['bert_score'])
welt_senti_compare['vote_label'] = np.where((welt_senti_compare['lexi_score'] > welt_senti_compare['bert_score']), welt_senti_compare['lexi_label'], welt_senti_compare['bert_label'])

In [7]:
welt_senti_compare.drop(['lexi_score','lexi_label', 'bert_label', 'bert_score'], axis=1, inplace=True)

In [9]:
# saving to disk
with mgzip.open('../data/welt_ssentia.mgzip', 'wb') as handle:
    pickle.dump(welt_senti_compare, handle)

In [2]:
# load command
with mgzip.open('../data/welt_ssentia.mgzip', 'rb') as handle:
    welt_senti_compare = pickle.load(handle)

In [3]:
train = welt_senti_compare[welt_senti_compare['vote_score'] > welt_senti_compare['vote_score'].mean() + 0.5 * welt_senti_compare['vote_score'].std()]
test = welt_senti_compare[welt_senti_compare['vote_score'] < welt_senti_compare['vote_score'].mean() + 0.5 * welt_senti_compare['vote_score'].std()]
train.drop(['title', 'vote_score', 'vote'], axis=1, inplace=True)
test.drop(['title', 'vote_score', 'vote'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [4]:
X_train = train.loc[:, train.columns != 'vote_label']
y_train = train['vote_label']
X_test = test.loc[:, test.columns != 'vote_label']
y_test = test['vote_label']

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [6]:
encoder = LabelEncoder()
vectorizer = TfidfVectorizer(ngram_range=(0,1), tokenizer=lambda x: x.split())
X_train = vectorizer.fit_transform(X_train['comment'])
X_test = vectorizer.transform(X_test['comment'])
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [60]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# Criterion
criterion =['gini', 'entropy']
# Number of features to consider at every split
max_features = ['log2', 'sqrt', 'auto']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5 , cv = 3, verbose = 3, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

rf_random.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 2/3; 2/5] START criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10
[CV 2/3; 2/5] END criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10;, score=nan total time=   0.1s
[CV 2/3; 5/5] START criterion=entropy, max_depth=23, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=230
[CV 2/3; 5/5] END criterion=entropy, max_depth=23, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=230;, score=nan total time=   0.0s
[CV 2/3; 2/5] START criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10
[CV 2/3; 2/5] END criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10;, score=nan total time=   0.0s
[CV 3/3; 4/5] START criterion=entropy, max_depth=89, max_features=log2,

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   10.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   16.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   18.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   20.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elap

[CV 1/3; 2/5] START criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10
[CV 1/3; 2/5] END criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10;, score=nan total time=   0.1s
[CV 1/3; 4/5] START criterion=entropy, max_depth=89, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=120
[CV 1/3; 4/5] END criterion=entropy, max_depth=89, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=120;, score=nan total time=   0.0s
[CV 3/3; 1/5] START criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10
[CV 3/3; 1/5] END criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10;, score=nan total time=   0.1s
[CV 3/3; 3/5] START criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670
[CV 3/3; 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elap

[CV 2/3; 3/5] START criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670
[CV 2/3; 3/5] END criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670;, score=nan total time=   0.0s
[CV 1/3; 1/5] START criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10
[CV 1/3; 1/5] END criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10;, score=nan total time=   0.1s
[CV 1/3; 4/5] START criterion=entropy, max_depth=89, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=120
[CV 1/3; 4/5] END criterion=entropy, max_depth=89, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=120;, score=nan total time=   0.0s
[CV 1/3; 2/5] START criterion=entropy, max_depth=89, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=10
building 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    7.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    9.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   11.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   12.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   16.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   18.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   19.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elap

[CV 1/3; 3/5] START criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670
[CV 1/3; 3/5] END criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670;, score=nan total time=   0.0s
[CV 2/3; 4/5] START criterion=entropy, max_depth=89, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=120
[CV 2/3; 4/5] END criterion=entropy, max_depth=89, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=120;, score=nan total time=   0.0s
[CV 2/3; 5/5] START criterion=entropy, max_depth=23, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=230
[CV 2/3; 5/5] END criterion=entropy, max_depth=23, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=230;, score=nan total time=   0.0s
[CV 2/3; 1/5] START criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10
buil

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


building tree 2 of 670


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.8s remaining:    0.0s


building tree 3 of 670


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    2.7s remaining:    0.0s


building tree 4 of 670


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    3.5s remaining:    0.0s


building tree 5 of 670


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    4.4s remaining:    0.0s


building tree 6 of 670


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    5.1s remaining:    0.0s


building tree 7 of 670


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    6.0s remaining:    0.0s


building tree 8 of 670


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.8s remaining:    0.0s


building tree 9 of 670


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    7.7s remaining:    0.0s


building tree 10 of 670
building tree 11 of 670
building tree 12 of 670
building tree 13 of 670
building tree 14 of 670
building tree 15 of 670
building tree 16 of 670
building tree 17 of 670
building tree 18 of 670
building tree 19 of 670
building tree 20 of 670
building tree 21 of 670
building tree 22 of 670
building tree 23 of 670
building tree 24 of 670
building tree 25 of 670
building tree 26 of 670
building tree 27 of 670
building tree 28 of 670
building tree 29 of 670
building tree 30 of 670
building tree 31 of 670
building tree 32 of 670
building tree 33 of 670
building tree 34 of 670
building tree 35 of 670
building tree 36 of 670
building tree 37 of 670
building tree 38 of 670
building tree 39 of 670
building tree 40 of 670
building tree 41 of 670
building tree 42 of 670
building tree 43 of 670
building tree 44 of 670
building tree 45 of 670
building tree 46 of 670
building tree 47 of 670
building tree 48 of 670
building tree 49 of 670
building tree 50 of 670
building tree 51

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   15.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   19.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   23.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   27.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   30.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   33.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   37.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 670 out of 670 | elapsed: 31.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elap

building tree 311 of 670
building tree 312 of 670
building tree 313 of 670
building tree 314 of 670
[CV 2/3; 1/5] START criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10
[CV 2/3; 1/5] END criterion=gini, max_depth=34, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=10;, score=nan total time=   0.1s
[CV 1/3; 5/5] START criterion=entropy, max_depth=23, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=230
[CV 1/3; 5/5] END criterion=entropy, max_depth=23, max_features=auto, min_samples_leaf=4, min_samples_split=10, n_estimators=230;, score=nan total time=   0.0s
[CV 1/3; 3/5] START criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670
[CV 1/3; 3/5] END criterion=entropy, max_depth=89, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=670;, score=nan total time=   0.0s
[CV 2/3; 3/5] START criterion=entropy

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   19.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   23.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   26.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   30.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   33.8s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   37.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 670 out of 670 | elapsed: 31.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elap

building tree 315 of 670
building tree 316 of 670
building tree 317 of 670
building tree 318 of 670
building tree 319 of 670
building tree 320 of 670
building tree 321 of 670
building tree 322 of 670
building tree 323 of 670
building tree 324 of 670
building tree 325 of 670
building tree 326 of 670
building tree 327 of 670
building tree 328 of 670
building tree 329 of 670
building tree 330 of 670
building tree 331 of 670
building tree 332 of 670
building tree 333 of 670
building tree 334 of 670
building tree 335 of 670
building tree 336 of 670
building tree 337 of 670
building tree 338 of 670
building tree 339 of 670
building tree 340 of 670
building tree 341 of 670
building tree 342 of 670
building tree 343 of 670
building tree 344 of 670
building tree 345 of 670
building tree 346 of 670
building tree 347 of 670
building tree 348 of 670
building tree 349 of 670
building tree 350 of 670
building tree 351 of 670
building tree 352 of 670
building tree 353 of 670
building tree 354 of 670


[Parallel(n_jobs=1)]: Done 670 out of 670 | elapsed: 10.9min finished


{'n_estimators': 670,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 89,
 'criterion': 'entropy'}

In [7]:
regr = RandomForestClassifier(verbose=1, n_jobs=-1)
#regr = RandomForestClassifier(verbose = 1, n_jobs = -1, n_estimators = 670, min_samples_split = 2, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 89, criterion = 'entropy')
regr.fit(X_train,y_train)
regr.score(X_test,y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.9min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   24.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   47.7s finished


0.20406667532119724

In [8]:
# predict labels for the whole data
X = welt_senti_compare['comment']
X = vectorizer.transform(X)

In [9]:
welt_senti_compare['ssentia_label'] = regr.predict(X)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   24.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   52.6s finished


In [10]:
welt_senti_compare['ssentia_label'] = np.where(welt_senti_compare['ssentia_label'] == 2, 'positive', np.where(welt_senti_compare['ssentia_label'] == 1, 'neutral', 'negative'))

### Finding most numerous mode

In [20]:
welt_group = welt_senti_compare[['title', 'vote_label', 'vote_score']]

In [21]:
welt_group['vote_value'] = np.where(welt_group['vote_label'] == 'positive', 3, np.where(welt_group['vote_label'] == 'neutral', 2, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  welt_group['vote_value'] = np.where(welt_group['vote_label'] == 'positive', 3, np.where(welt_group['vote_label'] == 'neutral', 2, 1))


In [22]:
# general grouping
#welt_group = welt_group.groupby('title', as_index = False)[['vote_label', 'ssentia_label']].agg(vote=('vote_label', lambda x: pd.Series.mode(x)[0]), ssentia=('ssentia_label', lambda x: pd.Series.mode(x)[0]), count=('vote_label', np.count_nonzero))

In [23]:
# vote_value grouping with mean
welt_group = welt_group.groupby(['title', 'vote_label'], as_index = False)['vote_value', 'vote_label', 'vote_score'].agg(value = ('vote_value', np.mean),count=('vote_label', np.count_nonzero),conf_value=('vote_score', np.mean))

  welt_group = welt_group.groupby(['title', 'vote_label'], as_index = False)['vote_value', 'vote_label', 'vote_score'].agg(value = ('vote_value', np.mean),count=('vote_label', np.count_nonzero),conf_value=('vote_score', np.mean))


In [24]:
# log of counts in order to model the amount of comments it got but not overemphasize outliers too much
welt_group['log_count'] = np.log(welt_group['count'])

In [25]:
welt_group['anteil_intern'] = welt_group['count'] / welt_group.groupby('title')['count'].transform('sum')

In [26]:
welt_group['final'] = (np.log(welt_group['conf_value'])*(-1))*welt_group['value']*np.sqrt(welt_group['log_count']*welt_group['anteil_intern'])

In [27]:
welt_group_value = welt_group.groupby(['title'], as_index = False)['value', 'count', 'final'].agg(calc=('final', np.mean), value=('value', np.mean), count=('count', np.sum))

  welt_group_value = welt_group.groupby(['title'], as_index = False)['value', 'count', 'final'].agg(calc=('final', np.mean), value=('value', np.mean), count=('count', np.sum))


In [28]:
welt_group_value

Unnamed: 0,title,calc,value,count
0,\nBundesfamilienministerin Spiegel wirbt für K...,1.146114,2.0,500
1,"""Alberne Einschränkungen für doppelt Geimpfte...",0.319030,2.0,7
2,"""Jingle Bells"" - US-Festtagsstimmung trotz Co...",0.519983,1.0,2
3,Einschränkungen für Feiern – auch in den eige...,0.000000,1.0,1
4,Eishockey-Profi leidet an Herzmuskel-Entzündung,0.283777,1.5,4
...,...,...,...,...
6450,Über Nacht erschafft Trump in Amerika das bish...,1.306114,2.0,1300
6451,Über eine Milliarde Euro – Deutsche Bank verbu...,0.520038,2.0,10
6452,Übergangslager auf Lesbos fast voll – 213 Coro...,0.858500,2.0,53
6453,Übernahmeflug von Griechenland nach Deutschlan...,1.012764,2.0,175


In [29]:
# saving to disk
with mgzip.open('../data/welt_predict_new.mgzip', 'wb') as handle:
    pickle.dump(welt_group_value, handle)

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()
svc.fit(X_train,y_train)

In [None]:
svc.score(X_test,y_test)

In [377]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=1, n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 