In [1]:
# taken from https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

# Load data and prepare data-frame
# - strip unused columns 
# - add additional information
# - remove too short utterances

import pandas as pd
min_word_count = 1
categories_to_consider = ['praise', 'dissence', 'lecture', 'concession', 'insinuation']

file = '../webserver/model_data/TrainingData_ml.xlsx'
col = ['utterance', 'category']
df = pd.read_excel(file).rename(columns={'Effekt': 'category'})

df = df[col]
df = df[pd.notnull(df['utterance'])]
df.columns = ['utterance', 'category']

df = df[df['category'].isin(categories_to_consider)]         


df['category_id'] = df['category'].factorize()[0]
df['word_count'] = list(map(lambda x:len(x.split(' ')), df['utterance']))
category_id_df = df[['category', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'category']].values)
# filter for minimal word count
df = df[df['word_count']>=min_word_count]

df.head()

Unnamed: 0,utterance,category,category_id,word_count
0,Merkel war und ist hier eine getriebene,insinuation,0,7
1,Das ist doch albern.,lecture,1,4
2,Dass ich nicht lache,dissence,2,4
3,Sie haben schlicht gar nichts begriffen.,lecture,1,6
4,Stellen Sie sich nicht dumm,lecture,1,5


In [2]:
# Analyze category distribution

import matplotlib.pyplot as plt
fig = plt.figure(figsize=(8,6))
df.groupby('category').utterance.count().plot.bar(ylim=0)
plt.title('Training distribution, n={}'.format(len(df)))
plt.show()

<Figure size 800x600 with 1 Axes>

In [3]:
# Vectorization

from sklearn.feature_extraction.text import TfidfVectorizer
from stop_words import get_stop_words

vectorizer = 'german_model'
if vectorizer == 'tfifd':
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2))
    # , stop_words=get_stop_words('de')
    features = tfidf.fit_transform(df.utterance).toarray()
    labels = df.category_id
elif vectorizer == 'german_model':
    import sys, os
    sys.path.insert(0, os.path.abspath('..'))
    from webserver.classification import trainer
    sentences, categories = trainer.read_trainingdata_utterances(df)
    features, labels = trainer.vectorize_corpus(sentences, trainer.sentence_to_vec_german_model, categories=categories)
features.shape

ModuleNotFoundError: No module named 'stop_words'

In [None]:
# Analyze predictive features

from sklearn.feature_selection import chi2
import numpy as np
N = 2
features_chi2 = {}
feature_names = {}

for Effekt, category_id in sorted(category_to_id.items()):
  raw_chi2 = chi2(features, labels == category_id)
  indices = np.argsort(raw_chi2[0])[::-1]
  feature_names[category_id] = np.array(tfidf.get_feature_names())[indices]
  features_chi2[category_id] = raw_chi2[0][indices]
  unigrams = [v for v in feature_names[category_id] if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names[category_id] if len(v.split(' ')) == 2]
  print("# '{}':".format(Effekt))
  print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[0:N])))
  print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[0:N])))

In [None]:
N = 20
for index, cat in id_to_category.items():
    fig = plt.figure(figsize=[20, 6])
    plt.bar(range(N), features_chi2[index][0:N])
    plt.xticks(range(N), feature_names[index][0:N])
    plt.title(cat)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(df['utterance'], df['category'], random_state = 0)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
print(clf.predict(count_vect.transform(["Da musst du aber noch mal ran, das muss noch besser werden!"])))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    SGDClassifier(tol=1e-3, max_iter=1000, loss='modified_huber'),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
import seaborn as sns
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df, 
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()
cv_df.groupby('model_name').accuracy.mean()

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
model = SGDClassifier(tol=1e-3, max_iter=1000, random_state=0, loss='modified_huber')
model = MultinomialNB()
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, range(len(labels)), test_size=0.33, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.category.values, yticklabels=category_id_df.category.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
from IPython.display import display
for predicted in category_id_df.category_id:
  for actual in category_id_df.category_id:
    if predicted != actual and conf_mat[actual, predicted] >= 10:
      print("'{}' predicted as '{}' : {} examples.".format(id_to_category[actual], id_to_category[predicted], conf_mat[actual, predicted]))
      display(df.loc[indices_test[(y_test == actual) & (y_pred == predicted)]][['category', 'utterance']])
      print('')

In [None]:
## Check doc2vec
# taken from https://medium.com/towards-artificial-intelligence/text-classification-by-xgboost-others-a-case-study-using-bbc-news-articles-5d88e94a9f8
import pandas as pd

file = '../webserver/model_data/TrainingData_ml.xlsx'
df = pd.read_excel(file).rename(columns={'Effekt': 'category'})
col = ['utterance', 'category']
df = df[col]

from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.strip_short, 
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline



def plot_word_cloud(text):
    wordcloud_instance = WordCloud(width = 800, height = 800, 
                background_color ='black', 
                stopwords=None,
                min_font_size = 10).generate(text) 
             
    plt.figure(figsize = (8, 8), facecolor = None) 
    plt.imshow(wordcloud_instance) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
    plt.show()

    
texts = ''
for index, item in df.iterrows():
    texts = texts + ' ' + clean_text(item['utterance'])
    
plot_word_cloud(texts)

In [None]:
def plot_word_cloud_for_category(df, category):
    text_df = df.loc[df['category'] == str(category)]
    texts = ''
    for index, item in text_df.iterrows():
        texts = texts + ' ' + clean_text(item['utterance'])
    
    plot_word_cloud(texts)

In [None]:
plot_word_cloud_for_category(df,'praise')