In [1]:
# taken from https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

# Load data and prepare data-frame
# - strip unused columns 
# - add additional information
# - remove too short utterances

import pandas as pd
import sys
from os import path

sys.path.append(path.abspath('..'))
from webserver.classification import trainer
# --- Configuration -----------------------------------------------------------
min_word_count = 2
data_path = '../webserver/model_data'
col = ['utterance', 'category']
categories_to_consider = ['praise', 'dissence', 'lecture', 'concession', 'insinuation']

# File-name pattern handed to the trainer's loader.
# NOTE(review): if the loader evaluates this as a regular expression, the '.'
# in front of 'tsv' is unescaped and matches any character - confirm.
td_pattern = 'TrainingData(Boris|Pelle)[0-9][0-9].tsv'

# --- Load the training data ---------------------------------------------------
df = trainer.load_training_files(data_path, td_pattern, categories_to_consider)

# One row per distinct (category, category_id) pair, ordered by id.
category_id_df = (
    df[['category', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

# Lookup tables in both directions, built from the same rows.
category_to_id = dict(category_id_df.values)
id_to_category = {cat_id: cat for cat, cat_id in category_id_df.values}

df.head()

Loading /Users/staude/Projekte/BorisJoens/SmallData/webserver/model_data/TrainingDataPelle02.tsv
Loading /Users/staude/Projekte/BorisJoens/SmallData/webserver/model_data/TrainingDataPelle01.tsv
Loading /Users/staude/Projekte/BorisJoens/SmallData/webserver/model_data/TrainingDataBoris01.tsv
Loading /Users/staude/Projekte/BorisJoens/SmallData/webserver/model_data/TrainingDataBoris02.tsv


Unnamed: 0,utterance,category,word_count,category_id
1,"Wer die Weakerthans nennt, darf Propagandhi ni...",lecture,9,2
3,Sorry schwerer Irrtum.,dissence,3,1
4,"Juhu, Petric der Held.",praise,4,0
5,das ist ja total ätzend.,dissence,5,1
7,Allein mit dem Track hat er die ganze Deutschr...,praise,14,0


In [None]:
# Analyze category distribution

import matplotlib.pyplot as plt
# Bar chart: number of training utterances per category.
fig, ax = plt.subplots(figsize=(8, 6))
utterances_per_category = df.groupby('category')['utterance'].count()
utterances_per_category.plot.bar(ylim=0, ax=ax)
ax.set_title('Training distribution, n={}'.format(len(df)))
plt.show()

In [None]:
# Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words
import numpy as np

# Selects the branch taken inside vectorize(): 'tfidf' or 'german_model'.
vectorizer_name = 'tfidf'
def vectorize(df, vectorizer_name):
    """Turn the utterances of ``df`` into a feature matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``utterance`` column; the ``'german_model'`` branch also
        reads ``category_id``.
    vectorizer_name : str
        ``'tfidf'`` or ``'german_model'``.

    Returns
    -------
    tuple
        ``(df, features)``. With ``'tfidf'`` the frame is returned unchanged
        and ``features`` is the dense TF-IDF matrix of uni- and bigrams.
        With ``'german_model'`` a NEW frame is built from the sentences that
        ``trainer.vectorize_corpus`` returns (columns ``utterance``,
        ``category``, ``category_id``), so it can have fewer rows than the
        input; ``features`` is whatever ``vectorize_corpus`` returns.

    Raises
    ------
    ValueError
        If ``vectorizer_name`` is not one of the supported names. Previously
        this case fell through to the ``return`` and failed with an
        ``UnboundLocalError`` on ``features``.

    Notes
    -----
    The ``'german_model'`` branch reads the notebook-level ``id_to_category``
    mapping to translate label ids back to category names.
    """
    if vectorizer_name == 'tfidf':
        # Stop-word removal (get_stop_words('de')) is intentionally not enabled here.
        tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2))
        features = tfidf.fit_transform(df.utterance).toarray()
    elif vectorizer_name == 'german_model':
        # Local import: the trainer module is only needed for this branch.
        import sys, os
        sys.path.insert(0, os.path.abspath('..'))
        from webserver.classification import trainer
        sentences = df.utterance.to_list()
        categories = df.category_id.to_list()
        sentences_cleaned, features, labels = trainer.vectorize_corpus(sentences, trainer.sentence_to_vec_german_model, categories=categories, representation=trainer.REPRESENTATION_TYPE_WORD)
        print('Removed {} sentences'.format(len(sentences)-len(sentences_cleaned)))
        # Rebuild the frame so its rows line up with the returned features.
        df = pd.DataFrame(np.array([sentences_cleaned, [id_to_category[ll] for ll in labels]]).T, columns = ['utterance', 'category'])
        df['category_id'] = labels
    else:
        raise ValueError(
            "Unknown vectorizer_name {!r}; expected 'tfidf' or 'german_model'".format(vectorizer_name)
        )
    return df, features

# df is rebound as well because vectorize() returns a rebuilt frame in 'german_model' mode.
df, features = vectorize(df, vectorizer_name)
# Cell output: dimensions of the feature matrix.
features.shape

In [None]:
# Analyze predictive features (works only for TFIDF)

from sklearn.feature_selection import chi2
import numpy as np
N = 2
features_chi2 = {}
feature_names = {}

# This analysis always uses TF-IDF, independent of `vectorizer_name`. The matrix
# is kept under its own name so the notebook-level `features` produced by the
# vectorization cell is not overwritten (which would silently replace the
# 'german_model' features used by the later modelling cells).
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2))
tfidf_features = tfidf.fit_transform(df.utterance).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and only fall back on releases that predate get_feature_names_out().
if hasattr(tfidf, 'get_feature_names_out'):
    vocabulary = np.asarray(tfidf.get_feature_names_out())
else:
    vocabulary = np.array(tfidf.get_feature_names())

for category, category_id in sorted(category_to_id.items()):
    # One-vs-rest chi-squared test: this category against all others.
    chi2_scores, _p_values = chi2(tfidf_features, df.category_id == category_id)
    ranking = np.argsort(chi2_scores)[::-1]  # highest score first
    feature_names[category_id] = vocabulary[ranking]
    features_chi2[category_id] = chi2_scores[ranking]
    # With ngram_range=(1, 2) a term is either one word or two words joined by a space.
    unigrams = [v for v in feature_names[category_id] if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names[category_id] if len(v.split(' ')) == 2]
    print("# '{}':".format(category))
    print("  - Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[0:N])))
    print("  - Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[0:N])))

In [None]:
# Show the chi-squared scores of the top-N features for each category

# One bar chart per category: chi-squared score of its N highest-ranked features.
N = 20
bar_positions = range(N)
for category_id, category_name in id_to_category.items():
    fig = plt.figure(figsize=[20, 6])
    top_scores = features_chi2[category_id][:N]
    top_terms = feature_names[category_id][:N]
    plt.bar(bar_positions, top_scores)
    plt.xticks(bar_positions, top_terms)
    plt.title(category_name)

In [None]:
# Show MultinomialNB-result for specific utterance

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Hold out part of the data; only the training part is used to fit the pipeline.
X_train, X_test, y_train, y_test = train_test_split(df['utterance'], df['category'], random_state=0)

# Step 1: raw term counts. Step 2: TF-IDF re-weighting of those counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# The classifier was trained on TF-IDF weights, so a new utterance has to pass
# through BOTH fitted steps. Feeding it raw counts (as before) puts the query
# in a different feature scale than the training data.
query = ["Da musst du aber noch mal ran, das muss noch besser werden!"]
query_tfidf = tfidf_transformer.transform(count_vect.transform(query))
print(clf.predict(query_tfidf))

In [None]:
# Compare accuracies of different models

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
import seaborn as sns

# Candidate classifiers. Every estimator that accepts a seed gets one so the
# cross-validation scores are reproducible across notebook runs.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(random_state=0),
    SGDClassifier(tol=1e-3, max_iter=1000, loss='modified_huber', random_state=0),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5  # number of cross-validation folds

# Collect one (model, fold, accuracy) record per fold, then build the frame once.
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, df['category_id'], scoring='accuracy', cv=CV)
    entries.extend((model_name, fold_idx, accuracy) for fold_idx, accuracy in enumerate(accuracies))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies with the individual folds overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

# Cell output: mean accuracy per model.
cv_df.groupby('model_name').accuracy.mean()

In [None]:
## BUILD CONFUSION MATRIX WITH TRAIN/TEST SPLIT

from sklearn.linear_model import SGDClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

model = SGDClassifier(tol=1e-3, max_iter=1000, random_state=0, loss='modified_huber')

# Split features, labels and row positions together so the three stay aligned.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, df.category_id, range(len(df.category_id)), test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Pass the full, ordered id list explicitly: without `labels` the matrix only
# covers ids that occur in y_test/y_pred, so a category missing from the split
# would shift rows/columns against the tick labels below.
conf_mat = confusion_matrix(y_test, y_pred, labels=category_id_df.category_id.values)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.category.values,
            yticklabels=category_id_df.category.values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
## SHOW MISCLASSIFIED SENTENCES WITH TRAIN/TEST SPLIT

from IPython.display import display
import numpy as np

# Walk every off-diagonal cell of the confusion matrix and list the test
# utterances behind each frequent confusion (10 or more cases).
for predicted in category_id_df.category_id:
    for actual in category_id_df.category_id:
        if predicted == actual:
            continue
        n_confused = conf_mat[actual, predicted]
        if n_confused < 10:
            continue
        is_confused = (y_test == actual) & (y_pred == predicted)
        selected_indices = y_test[is_confused]
        print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], n_confused))
        display(df.loc[selected_indices.index, ['category', 'utterance']])
        print('')

In [None]:
## BUILD CONFUSION MATRIX WITH DEDICATED TEST-SET

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB


import seaborn as sns
import matplotlib.pyplot as plt
# model = SGDClassifier(tol=1e-3, max_iter=1000, random_state=0, loss='modified_huber')
from sklearn.metrics import confusion_matrix

model = MultinomialNB()

# NOTE(review): assumes load_training_files assigns the same category -> id
# mapping to the test file as to the training files - confirm.
df_test = trainer.load_training_files(data_path, 'TrainingDataTestSet01.tsv', categories_to_consider)

# Train on the complete training set.
model.fit(features, df.category_id)

if vectorizer_name == 'tfidf':
    # Refit the vectorizer on the training utterances with the same settings
    # that vectorize() uses, then project the test utterances into that
    # vocabulary. Stop-word removal stays disabled, as in vectorize().
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2))
    tfidf.fit(df.utterance)
    features_test = tfidf.transform(df_test.utterance)
else:
    df_test, features_test = vectorize(df_test, vectorizer_name)

y_pred = model.predict(features_test)

# Pass the full, ordered id list explicitly: a small test set may lack a
# category, and without `labels` the matrix would then have fewer rows/columns
# than there are tick labels (and than the id-based indexing used when listing
# the misclassified sentences expects).
conf_mat = confusion_matrix(df_test.category_id, y_pred, labels=category_id_df.category_id.values)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.category.values,
            yticklabels=category_id_df.category.values,
            ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
## SHOW WRONG SENTENCES WITH DEDICATED TEST SET

from IPython.display import display
import numpy as np

# For every (actual, predicted) pair with at least one confusion, print the
# affected test utterances one per line.
for predicted in category_id_df.category_id:
    for actual in category_id_df.category_id:
        if predicted == actual:
            continue
        n_confused = conf_mat[actual, predicted]
        if n_confused < 1:
            continue
        is_confused = (df_test.category_id == actual) & (y_pred == predicted)
        selected_indices = df_test.category_id[is_confused]
        print("'{}' predicted as '{}' : {} examples:".format(id_to_category[actual], id_to_category[predicted], n_confused))
        for utterance in df_test.loc[selected_indices.index, 'utterance']:
            print(utterance)
        print('')

In [None]:
## Check doc2vec
# taken from https://medium.com/towards-artificial-intelligence/text-classification-by-xgboost-others-a-case-study-using-bbc-news-articles-5d88e94a9f8
import pandas as pd

# NOTE: this rebinds the notebook-level `df` to the Excel-based data set.
file = '../webserver/model_data/TrainingData_ml.xlsx'
col = ['utterance', 'category']
# The sheet names the label column 'Effekt'; rename it and keep only the two columns of interest.
df = pd.read_excel(file).rename(columns={'Effekt': 'category'})[col]

from gensim import utils
import gensim.parsing.preprocessing as gsp

# gensim preprocessing functions; clean_text() applies them to a string one
# after another, in exactly this order.
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.strip_short, 
          ]

def clean_text(s):
    """Lower-case ``s``, convert it with ``utils.to_unicode`` and run it through every entry of ``filters`` in order."""
    cleaned = utils.to_unicode(s.lower())
    for apply_filter in filters:
        cleaned = apply_filter(cleaned)
    return cleaned

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline



def plot_word_cloud(text):
    """Generate a word cloud from ``text`` and show it in an 8x8 figure without axes."""
    cloud = WordCloud(
        width=800,
        height=800,
        background_color='black',
        stopwords=None,
        min_font_size=10,
    )
    cloud.generate(text)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

    
# Build the corpus text in a single pass: every cleaned utterance preceded by
# one space. Joining once avoids re-copying a growing string per row, and
# reading the column directly avoids materialising a Series per row.
texts = ''.join(' ' + clean_text(utterance) for utterance in df['utterance'])

plot_word_cloud(texts)

In [None]:
def plot_word_cloud_for_category(df, category):
    """Plot a word cloud of all utterances labelled with ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``utterance`` and ``category`` columns.
    category
        Category to select; it is compared as ``str(category)`` against the
        ``category`` column.

    The selected utterances are cleaned with ``clean_text`` and handed to
    ``plot_word_cloud`` as one string in which every utterance is preceded by
    a single space (an empty string if no row matches).
    """
    text_df = df.loc[df['category'] == str(category)]
    # Join once instead of growing a string row by row.
    texts = ''.join(' ' + clean_text(utterance) for utterance in text_df['utterance'])

    plot_word_cloud(texts)

In [None]:
### TEST GENSIM

import sys, os
sys.path.insert(0, os.path.abspath('..'))
from webserver.classification import trainer
import numpy as np


# Distance (vector norm of the difference) between the embeddings of two city names.
vec_berlin = trainer.sentence_to_vec_german_model('Berlin')
vec_paris = trainer.sentence_to_vec_german_model('Paris')
np.linalg.norm(vec_berlin - vec_paris)
        

In [None]:
# Display the trainer's `model` attribute. The line previously ended in a
# dangling '.', which is a SyntaxError and breaks "Run All".
# NOTE(review): assumes `trainer.model` exists - confirm.
trainer.model