In [1]:
import pandas as pd
import numpy as np
import spacy
import nltk
from ast import literal_eval
from HanTa import HanoverTagger as ht
import string
from tqdm.auto import tqdm
from collections import Counter
import top2vec
import torch
from itertools import chain
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from typing import List
import re
import pickle
import mgzip

tqdm.pandas()

# tagesspiegel Dataset comments sentiment classification
## Preprocessing
First, the original scraped text file is loaded (note: I use gzip compression due to the big size of the files). Since tThe original file is in the wide format, I use the melt function of pandas to transpose the data into the long format. This enables to compare the methods of sentiment classification more easily later.

In [None]:
tagesspiegel = pd.read_csv('../data/tagesspiegel_scraped.gzip', compression='gzip', low_memory=False)

In [None]:
# columns initially scraped but not necessary for the analysis anymore are deleted
# the article text is removed since this notebook is only for the sentiment analysis
# the article title serves as the ID of each article, since it is shorter than the article text
tagesspiegel.drop(['Unnamed: 0', 'combined_text', 'link'], axis=1, inplace=True)

In [None]:
comment_cols = [col for col in tagesspiegel.columns if 'Comment' in col]

In [None]:
tagesspiegel = pd.melt(tagesspiegel, id_vars=['title', 'date'], value_vars=comment_cols)

In [None]:
tagesspiegel.dropna(inplace=True)

### Tokenization & punctuation removal

In [None]:
tagesspiegel['tokens'] = tagesspiegel['value'].progress_apply(nltk.word_tokenize)

In [None]:
def remove_punctuation(text):
    text_list = []
    for word in text:
        for punctuation in string.punctuation:
            word = word.replace(punctuation, '')
        if not word:
            continue
        text_list.append(word.lower())
    return text_list

In [None]:
tagesspiegel['tokens'] = tagesspiegel['tokens'].progress_apply(remove_punctuation)

### Stop word removal & lemmatization
Following, stopwords without meaning ('der', 'wo', etc.) are removed from the comments. I edited the sourced stopwords list and removed any negations, since these contain emotions that I want to capture later in the workflow. Afterwards, the remaining tokenized words for each comment are lemmatized, i.e. reduced to their basic form. In order to do this I utilise the 'Hannover Tagger', which also provides part of speech information, although I do not use it (the main reason that I do not use the PoS information is that the utilised German emotion and sentiment lexical use different PoS abbreviations, resulting in no matches).

In [None]:
stopwords = open('../resources/german_stopwords-master/german_stopwords_plain.txt').read().splitlines()

In [None]:
def stop_word_removal(x):
    return list([w for w in x if not w in stopwords])

In [None]:
tagesspiegel['tokens'] = tagesspiegel['tokens'].progress_apply(stop_word_removal)

In [None]:
tagger = ht.HanoverTagger('morphmodel_ger.pgz')

In [None]:
def tagger_custom(input):
    tmp_list = []
    for word in input:
        tmp_list.append(tagger.analyze(word))
    return tmp_list

In [None]:
tagesspiegel['tokens'] = tagesspiegel['tokens'].progress_apply(tagger_custom)

In [None]:
with mgzip.open("../data/tagesspiegel_comments_preprocessed.mgzip", 'wb') as f:
    pickle.dump(tagesspiegel, f)

In [None]:
# open from disk
with mgzip.open('../data/tagesspiegel_comments_preprocessed.mgzip', 'rb') as handle:
    tagesspiegel_pre = pickle.load(handle)

### Lemmatization of emotion lexicon
The utilised emotion lexicon by Robert Klinger is also lemmatized, in order to be able to match the lemmatized words in the comments to the ones in the lexicals. Note: I used an IF statement per lexicon since I did not manage to make it work in a combined FOR loop.

In [None]:
ekel = open('../resources/german-emotion-dictionary/Ekel.txt').read().splitlines()
freude = open('../resources/german-emotion-dictionary/Freude.txt').read().splitlines()
furcht = open('../resources/german-emotion-dictionary/Furcht.txt').read().splitlines()
trauer = open('../resources/german-emotion-dictionary/Trauer.txt').read().splitlines()
surprise = open('../resources/german-emotion-dictionary/Ueberraschung.txt').read().splitlines()
verachtung = open('../resources/german-emotion-dictionary/Verachtung.txt').read().splitlines()
wut = open('../resources/german-emotion-dictionary/Wut.txt').read().splitlines()

In [None]:
tmp_list = []
for words in ekel:
   tmp_list.append(tagger.analyze(words))
ekel = tmp_list
tmp_list = []
for emotion in ekel:
    x, y = emotion
    tmp_list.append(x.lower())
ekel = tmp_list

tmp_list = []
for words in freude:
    tmp_list.append(tagger.analyze(words))
freude = tmp_list
tmp_list = []
for emotion in freude:
    x, y = emotion
    tmp_list.append(x.lower())
freude = tmp_list

tmp_list = []
for words in furcht:
    tmp_list.append(tagger.analyze(words))
furcht = tmp_list
tmp_list = []
for emotion in furcht:
    x, y = emotion
    tmp_list.append(x.lower())
furcht = tmp_list

tmp_list = []
for words in trauer:
    tmp_list.append(tagger.analyze(words))
trauer = tmp_list
tmp_list = []
for emotion in trauer:
    x, y = emotion
    tmp_list.append(x.lower())
trauer = tmp_list

tmp_list = []
for words in surprise:
    tmp_list.append(tagger.analyze(words))
surprise = tmp_list
tmp_list = []
for emotion in surprise:
    x, y = emotion
    tmp_list.append(x.lower())
surprise = tmp_list

tmp_list = []
for words in verachtung:
    tmp_list.append(tagger.analyze(words))
verachtung = tmp_list
tmp_list = []
for emotion in verachtung:
    x, y = emotion
    tmp_list.append(x.lower())
verachtung = tmp_list

tmp_list = []
for words in wut:
    tmp_list.append(tagger.analyze(words))
wut = tmp_list
tmp_list = []
for emotion in wut:
    x, y = emotion
    tmp_list.append(x.lower())
wut = tmp_list

## Lexicon emotion classification (comments)
In this step, I assign every word in the tokenized and lemmatized comments to an emotion (if there is a match). Afterwards, for every comment the dominant emotion is detected and assigned as the overall emotion of the comment. This leaves me with one emotion per comment, which are added into one large list per news article. The resulting file is saved with the ending '_emo_clas'.

In [None]:
def emo_class(input):
    tmp_list = []
    for word in input:
        if not word:
            continue
        x, y  = word
        if str(x).lower() in ekel:
            tmp_list.append('ekel')
        if str(x).lower() in freude:
            tmp_list.append('freude')
        if str(x).lower() in furcht:
            tmp_list.append('furcht')
        if str(x).lower() in trauer:
            tmp_list.append('trauer')
        if str(x).lower() in surprise:
            tmp_list.append('surprise')
        if str(x).lower() in verachtung:
            tmp_list.append('verachtung')
        if str(x).lower() in wut:
            tmp_list.append('wut')
    if not tmp_list:
        return None
    return Counter(tmp_list).most_common(1)[0][0]

In [None]:
tagesspiegel_pre['emotion'] = tagesspiegel_pre['tokens'].progress_apply(emo_class)

## Sentiment classification with lexicon (comments)
Similar to the previous step, the comments are also assigned to negative or positive sentiment overall, in a similar fashion. The result is saved in a new file with the ending '_senti_clas'.

In [None]:
senti_merge = pd.read_csv('../resources/sentimerge/data/sentimerge.txt', sep='\t')
senti_merge.drop(['PoS'], axis=1, inplace=True)

In [None]:
senti_merge_pos = senti_merge.loc[senti_merge['sentiment'] > 0]
senti_merge_neg = senti_merge.loc[senti_merge['sentiment'] < 0]

In [None]:
senti_pos_dict = dict(zip(senti_merge_pos['lemma'], senti_merge_pos['sentiment']))
senti_neg_dict = dict(zip(senti_merge_neg['lemma'], senti_merge_neg['sentiment']))
senti_weight_dict = dict(zip(senti_merge['lemma'], senti_merge['weight']))

In [None]:
def senti_class(input):
    tmp_list = []
    for word in input:
        if not word:
            continue
        x, y  = word
        if str(x).lower() in senti_pos_dict:
            tmp_list.append((senti_pos_dict[x.lower()]*senti_weight_dict[x.lower()]))
        if str(x).lower() in senti_neg_dict:
            tmp_list.append((senti_neg_dict[x.lower()]*senti_weight_dict[x.lower()]))
    if not tmp_list:
        return np.nan
    return sum(tmp_list)

In [132]:
def rescale(input):
    scale = input
    scale -= scale.min()
    scale /= scale.max()
    return scale

In [None]:
def lexi_label(input):
    if input > 1:
        return 'positive'
    elif input < -1:
        return 'negative'
    elif pd.isna(input):
        return np.nan
    else:
        return 'neutral'

In [None]:
tagesspiegel_pre['lexi_score'] = tagesspiegel_pre['tokens'].progress_apply(senti_class)

In [None]:
tagesspiegel_pre['lexi_label'] = tagesspiegel_pre['lexi_score'].apply(lexi_label)

In [None]:
tagesspiegel_pre['lexi_score'] = rescale(tagesspiegel_pre['lexi_score'])

In [None]:
import plotly.express as px
fig = px.histogram(tagesspiegel_pre, x="lexi_score", nbins=1000)
fig.show()

In [None]:
tagesspiegel_lexi = tagesspiegel_pre[['title', 'value', 'lexi_score', 'lexi_label']]

In [None]:
# saving to disk
with mgzip.open('../data/tagesspiegel_lexi.mgzip', 'wb') as handle:
    pickle.dump(tagesspiegel_lexi, handle)

## Sentiment classification with BERT (comments)
##### File was produced on Google Colab due to GPU acceleration, saved as 'tagesspiegel_senti_clas_bert.gzip'.
In this step, the comments are classified for sentiment using the pretrained BERT model "GermanSentiment". This model was trained on 1.8 Million German language samples from various domains. This serves as a second method next to the lexicon approach and will be used to compare both methods. I modified the given code to also return the confidence logit values for each prediction. These are used to compare this and the lexicon methods.

In [None]:
class SentimentModel():
    def __init__(self, model_name: str):
        if torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        self.model = AutoModelForSequenceClassification.from_pretrained(model_name).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
        self.clean_http_urls = re.compile(r'https*\\S+', re.MULTILINE)
        self.clean_at_mentions = re.compile(r'@\\S+', re.MULTILINE)

    def predict_sentiment(self, texts):
        if np.any(pd.isna(texts)):
          return np.nan
        texts = [self.clean_text(str(text)) for text in [texts]]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
        encoded = self.tokenizer.batch_encode_plus(texts,padding=True, add_special_tokens=True,truncation=True, return_tensors="pt")
        encoded = encoded.to(self.device)
        with torch.no_grad():
                logits = self.model(**encoded)

        label_ids = torch.argmax(logits[0], axis=1)
        for a in zip(logits[0].tolist(), [self.model.config.id2label[label_id] for label_id in label_ids.tolist()]):
            tmp_rsl = list(a)
        return tmp_rsl

    def replace_numbers(self,text: str) -> str:
            return text.replace("0"," null").replace("1"," eins").replace("2"," zwei").replace("3"," drei").replace("4"," vier").replace("5"," fünf").replace("6"," sechs").replace("7"," sieben").replace("8"," acht").replace("9"," neun")

    def clean_text(self,text: str)-> str:
            text = text.replace("\n", " ")
            text = self.clean_http_urls.sub('',text)
            text = self.clean_at_mentions.sub('',text)
            text = self.replace_numbers(text)
            text = self.clean_chars.sub('', text) # use only text chars
            text = ' '.join(text.split()) # substitute multiple whitespace with single whitespace
            text = text.strip().lower()
            return text

In [None]:
model = SentimentModel(model_name = "oliverguhr/german-sentiment-bert")

In [None]:
tagesspiegel_pre['bert'] = tagesspiegel_pre['value'].progress_apply(model.predict_sentiment)

In [None]:
def bert_score(input):
    return np.ptp(sorted(list(map(abs, input[0]))))

def bert_label(input):
    return input[1]

In [None]:
tagesspiegel_pre['bert_score'] = tagesspiegel_pre['bert'].apply(bert_score)

In [None]:
tagesspiegel_pre['bert_label'] = tagesspiegel_pre['bert'].apply(bert_label)

In [None]:
rescale(tagesspiegel_pre['bert_score'])

In [None]:
tagesspiegel_bert = tagesspiegel_pre[['title', 'bert_score', 'bert_label']]

In [None]:
with mgzip.open(".../data/tagesspiegel_bert.mgzip", 'wb') as f:
    pickle.dump(tagesspiegel_bert, f)

## SSentiA comments (comparison of lexicon & BERT methods)

### Predicting labels
For this step, I compare the predictions of both prior methods and pick the ones with the highest confident value. Further, the high confident predictions will be used to train other ML models to improve low confidence predictions.

In [2]:
# load command
with mgzip.open('../data/tagesspiegel_lexi.mgzip', 'rb') as handle:
    tagesspiegel_lexi = pickle.load(handle)

In [3]:
# load command
with mgzip.open('../data/tagesspiegel_bert.mgzip', 'rb') as handle:
    tagesspiegel_bert = pickle.load(handle)

In [4]:
tagesspiegel_senti_compare = pd.DataFrame()

In [5]:
tagesspiegel_senti_compare[['title', 'comment', 'lexi_score','lexi_label']] = tagesspiegel_lexi[['title', 'value', 'lexi_score','lexi_label']]
tagesspiegel_senti_compare[['bert_score','bert_label']] = tagesspiegel_bert[['bert_score','bert_label']]

In [7]:
tagesspiegel_senti_compare['vote'] = np.where((tagesspiegel_senti_compare['lexi_score'] > tagesspiegel_senti_compare['bert_score']), 'lexi', 'bert')
tagesspiegel_senti_compare['vote_score'] = np.where((tagesspiegel_senti_compare['lexi_score'] > tagesspiegel_senti_compare['bert_score']), tagesspiegel_senti_compare['lexi_score'], tagesspiegel_senti_compare['bert_score'])
tagesspiegel_senti_compare['vote_label'] = np.where((tagesspiegel_senti_compare['lexi_score'] > tagesspiegel_senti_compare['bert_score']), tagesspiegel_senti_compare['lexi_label'], tagesspiegel_senti_compare['bert_label'])

In [8]:
tagesspiegel_senti_compare.drop(['lexi_score','lexi_label', 'bert_label', 'bert_score'], axis=1, inplace=True)

In [None]:
# saving to disk
with mgzip.open('../data/tagesspiegel_ssentia.mgzip', 'wb') as handle:
    pickle.dump(tagesspiegel_senti_compare, handle)

In [2]:
# load command
with mgzip.open('../data/tagesspiegel_ssentia.mgzip', 'rb') as handle:
    tagesspiegel_senti_compare = pickle.load(handle)

In [3]:
train = tagesspiegel_senti_compare[tagesspiegel_senti_compare['vote_score'] > tagesspiegel_senti_compare['vote_score'].mean() + 0.05 * tagesspiegel_senti_compare['vote_score'].std()]
test = tagesspiegel_senti_compare[tagesspiegel_senti_compare['vote_score'] < tagesspiegel_senti_compare['vote_score'].mean() + 0.05 * tagesspiegel_senti_compare['vote_score'].std()]
train.drop(['title', 'vote_score', 'vote'], axis=1, inplace=True)
test.drop(['title', 'vote_score', 'vote'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [4]:
X_train = train.loc[:, train.columns != 'vote_label']
y_train = train['vote_label']
X_test = test.loc[:, test.columns != 'vote_label']
y_test = test['vote_label']

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [6]:
encoder = LabelEncoder()
vectorizer = TfidfVectorizer(ngram_range=(0,1), tokenizer=lambda x: x.split())
X_train = vectorizer.fit_transform(X_train['comment'])
X_test = vectorizer.transform(X_test['comment'])
y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [49]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 1000, num = 10)]
# Criterion
criterion =['gini', 'entropy']
# Number of features to consider at every split
max_features = ['log2', 'sqrt', 'auto']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation,
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5 , cv = 3, verbose = 3, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train,y_train)

rf_random.best_params_

Fitting 3 folds for each of 5 candidates, totalling 15 fits


KeyboardInterrupt: 

In [7]:
#regr = RandomForestClassifier(verbose=1, n_jobs=-1)
regr = RandomForestClassifier(verbose = 1, n_jobs = -1, n_estimators = 670, min_samples_split = 2, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 89, criterion = 'entropy')
regr.fit(X_train,y_train)
regr.score(X_test,y_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 670 out of 670 | elapsed:  2.2min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.8s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Done 670 out of 670 | elapsed:    8.2s finished


0.1771346752209709

In [8]:
# predict labels for the whole data
X = tagesspiegel_senti_compare['comment']
X = vectorizer.transform(X)

In [9]:
tagesspiegel_senti_compare['ssentia_label'] = regr.predict(X)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Done 670 out of 670 | elapsed:    9.8s finished


In [10]:
tagesspiegel_senti_compare['ssentia_label'] = np.where(tagesspiegel_senti_compare['ssentia_label'] == 2, 'positive', np.where(tagesspiegel_senti_compare['ssentia_label'] == 1, 'neutral', 'negative'))

### Finding most numerous mode

In [11]:
tagesspiegel_group = tagesspiegel_senti_compare[['title', 'vote_label', 'vote_score', 'ssentia_label']]

In [12]:
tagesspiegel_group['vote_value'] = np.where(tagesspiegel_group['vote_label'] == 'positive', 3, np.where(tagesspiegel_group['vote_label'] == 'neutral', 2, 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tagesspiegel_group['vote_value'] = np.where(tagesspiegel_group['vote_label'] == 'positive', 3, np.where(tagesspiegel_group['vote_label'] == 'neutral', 2, 1))


In [13]:
# general grouping
#tagesspiegel_group = tagesspiegel_group.groupby('title', as_index = False)[['vote_label', 'ssentia_label']].agg(vote=('vote_label', lambda x: pd.Series.mode(x)[0]), ssentia=('ssentia_label', lambda x: pd.Series.mode(x)[0]), count=('vote_label', np.count_nonzero))

In [14]:
# vote_value grouping with mean
tagesspiegel_group = tagesspiegel_group.groupby(['title', 'vote_label'], as_index = False)['vote_value', 'vote_label', 'vote_score'].agg(value = ('vote_value', np.mean),count=('vote_label', np.count_nonzero),conf_value=('vote_score', np.mean))

  tagesspiegel_group = tagesspiegel_group.groupby(['title', 'vote_label'], as_index = False)['vote_value', 'vote_label', 'vote_score'].agg(value = ('vote_value', np.mean),count=('vote_label', np.count_nonzero),conf_value=('vote_score', np.mean))


In [39]:
tagesspiegel_group

Unnamed: 0,title,vote_label,value,count,conf_value,log_count,anteil_intern,final,test
0,Deutsche Bank verweigert Auskunft zu Trump-Ge...,negative,1.0,1,0.541383,0.000000,0.500000,0.000000,0.000000
1,Deutsche Bank verweigert Auskunft zu Trump-Ge...,neutral,2.0,1,0.571705,0.000000,0.500000,0.000000,0.000000
2,"""500 Euro reichen nicht zum Leben""",negative,1.0,1,0.502691,0.000000,0.500000,0.000000,0.000000
3,"""500 Euro reichen nicht zum Leben""",positive,3.0,1,0.580092,0.000000,0.500000,0.000000,0.000000
4,"""Alarmstufe Rot"" für die Veranstaltungsbranche",negative,1.0,1,0.553421,0.000000,0.500000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
21534,„Über unsere Arbeitsbedingungen will niemand m...,neutral,2.0,3,0.591422,1.098612,0.300000,0.603057,0.574094
21535,„Über unsere Arbeitsbedingungen will niemand m...,positive,3.0,3,0.634644,1.098612,0.300000,0.783105,0.574094
21536,"„Überlegen, ob er stellvertretender Ministerpr...",negative,1.0,7,0.537796,1.945910,0.636364,0.690239,1.112792
21537,"„Überlegen, ob er stellvertretender Ministerpr...",neutral,2.0,2,0.653186,0.693147,0.181818,0.302386,0.355002


In [40]:
# log of counts in order to model the amount of comments it got but not overemphasize outliers too much
tagesspiegel_group['log_count'] = np.log(tagesspiegel_group['count'])

In [41]:
tagesspiegel_group['anteil_intern'] = tagesspiegel_group['count'] / tagesspiegel_group.groupby('title')['count'].transform('sum')

In [42]:
tagesspiegel_group['test'] = np.sqrt(tagesspiegel_group['log_count']*tagesspiegel_group['anteil_intern'])

In [43]:
tagesspiegel_group['final'] = (np.log(tagesspiegel_group['conf_value'])*(-1))*tagesspiegel_group['value']*np.sqrt(tagesspiegel_group['log_count']*tagesspiegel_group['anteil_intern'])

In [44]:
tagesspiegel_group_value = tagesspiegel_group.groupby(['title'], as_index = False)['value', 'count', 'final'].agg(calc=('final', np.mean), value=('value', np.mean), count=('count', np.sum))

  tagesspiegel_group_value = tagesspiegel_group.groupby(['title'], as_index = False)['value', 'count', 'final'].agg(calc=('final', np.mean), value=('value', np.mean), count=('count', np.sum))


In [45]:
tagesspiegel_group_value

Unnamed: 0,title,calc,value,count
0,Deutsche Bank verweigert Auskunft zu Trump-Ge...,0.000000,1.5,2
1,"""500 Euro reichen nicht zum Leben""",0.000000,2.0,2
2,"""Alarmstufe Rot"" für die Veranstaltungsbranche",0.000000,1.5,2
3,"""An Willkommenskultur war nicht zu denken""",0.580165,2.0,14
4,"""Breite Teile der Bevölkerung werden in Not ge...",0.657748,2.0,9
...,...,...,...,...
9424,„Ältere und Kranke werden ihre Kontakte deutli...,0.996207,2.0,62
9425,„Ärzte können keinen Völkermord stoppen“,0.000000,2.0,1
9426,„Ökozid“ – Frankreich schafft neuen Straftatbe...,0.434294,2.0,8
9427,„Über unsere Arbeitsbedingungen will niemand m...,0.616820,2.0,10


In [46]:
# saving to disk
with mgzip.open('../data/tagesspiegel_predict_new.mgzip', 'wb') as handle:
    pickle.dump(tagesspiegel_group_value, handle)

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()
svc.fit(X_train,y_train)

In [None]:
svc.score(X_test,y_test)

In [377]:
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=1, n_jobs=-1)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


KeyboardInterrupt: 