In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import matplotlib.pyplot  as plt
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

import matplotlib as mpl
# Use an emoji-capable font for all matplotlib text; the class labels plotted below are emoji.
# NOTE(review): 'Segoe UI Emoji' is presumably only installed on Windows - confirm on other platforms.
mpl.rc('font',family='Segoe UI Emoji')

from sklearn import metrics
import itertools

import nltk
# Fetch the NLTK corpora up front; 'stopwords' backs the German stop word list used by clean_tweets().
# NOTE(review): 'twitter_samples' does not appear to be used in the visible cells - confirm before removing.
nltk.download('twitter_samples')
nltk.download('stopwords')

In [None]:
import string
import re
 
from nltk.corpus import stopwords 
# Kept as a frozenset: clean_tweets() tests every token for membership, which is a
# hash lookup on a set but a linear scan on the list returned by stopwords.words().
stopwords_german = frozenset(stopwords.words('german'))
 
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('german')
 
from nltk.tokenize import TweetTokenizer
 
# Emoticons expressing a positive mood
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}
 
# Emoticons expressing a negative mood
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}
 
# Every known emoticon, regardless of mood
emoticons = emoticons_happy | emoticons_sad
 
def clean_tweets(tweet):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: strip ``$TICKER`` symbols, a leading ``RT``, hyperlinks and
    ``#`` signs (the hashtag word itself is kept); replace four-digit years and
    free-standing numbers with placeholder tokens; tokenize with NLTK's
    ``TweetTokenizer``; drop German stop words, known emoticons and punctuation
    tokens; stem the remaining tokens with the German Snowball stemmer.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet as one string of tokens joined by single spaces
        (an empty string when no token survives the filters).
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
 
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
 
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/[^\s]*', '', tweet)
    
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    
    # replace years with 'ayearzzz'-Token
    # The digit look-arounds keep the pattern from matching four digits inside a
    # longer digit run (e.g. "12345" used to become "ayearzzz5").
    tweet = re.sub(r'(?<![0-9])[1-2][0-9]{3}(?![0-9])', r'ayearzzz', tweet)
    
    # replace numbers with 'anumberzzz'-Token, only numbers outside of words
    tweet = re.sub(r'(?<![0-9a-zA-Z])[0-9]+(?![0-9a-zA-Z])', r'anumberzzz', tweet)
 
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
 
    tweets_clean = []    
    for word in tweet_tokens:
        if (word not in stopwords_german and # remove stopwords
              word not in emoticons and # remove emoticons
                word not in string.punctuation): # remove punctuation (substring test against the punctuation string)
            stem_word = stemmer.stem(word) # stemming word
            tweets_clean.append(stem_word)
    tweets_clean=" ".join(tweets_clean)
    
    # remove numbers that were pulled out of words by tokenizer
    tweets_clean = re.sub(r'(?<![0-9a-zA-Z])[0-9]+(?![0-9a-zA-Z])', r'', tweets_clean)
    
    return tweets_clean
 
# Sample tweet exercising most cleaning rules: leading "RT", @handles, an emoticon,
# hashtags and a hyperlink.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
 
# print cleaned tweet
print (clean_tweets(custom_tweet))

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    The plot is drawn on the current matplotlib figure; creating, saving and
    showing the figure is left to the caller.

    Args:
        cm: Confusion matrix as a 2-D numpy array (rows are labelled
            'True label', columns 'Predicted label').
        classes: Tick labels, in the same order as the rows/columns of ``cm``.
        normalize: If True, every row is divided by its sum so that the cells
            show per-class fractions. Rows that sum to zero stay all-zero
            instead of turning into NaN.
        title: Title drawn above the heat map.
        cmap: Matplotlib colormap used for the cells.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true samples has a row sum of 0; dividing by 1
        # instead keeps that row at 0.0 rather than producing NaN cells and a
        # NaN colour threshold below.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
def get_most_important_features(vectorizer, model, n=5):
    """Collect the n highest- and n lowest-weighted vocabulary terms per coefficient row.

    Args:
        vectorizer: Fitted vectorizer exposing ``vocabulary_`` (term -> column index).
        model: Fitted linear model exposing a 2-D ``coef_`` array with one row
            per class (a single row for binary models).
        n: Number of terms to keep at each end of the ranking.

    Returns:
        dict mapping the row index of ``coef_`` to
        ``{'tops': [...], 'bottom': [...]}``, each a list of ``(weight, term)``
        tuples. 'tops' holds the n largest weights in ascending order,
        'bottom' the n smallest weights in descending order.
    """
    term_by_column = {column: term for term, column in vectorizer.vocabulary_.items()}

    ranking_per_class = {}
    for class_index, weights in enumerate(model.coef_):
        # Pair every weight with its term and rank from largest to smallest weight.
        ranked = [(weight, term_by_column[column]) for column, weight in enumerate(weights)]
        ranked.sort(key=lambda pair: pair[0], reverse=True)
        ranking_per_class[class_index] = {
            'tops': sorted(ranked[:n], key=lambda pair: pair[0]),
            'bottom': ranked[-n:],
        }
    return ranking_per_class

In [None]:
def plot_important_words_binary(importance, labels, name):
    """Plot the lowest- and highest-weighted terms of a binary model side by side.

    Opens a new 10x10 figure with two horizontal bar charts: the left one shows
    ``importance[0]['bottom']`` under the heading ``labels[0]``, the right one
    ``importance[0]['tops']`` under ``labels[1]``.

    Args:
        importance: Result of get_most_important_features(); only entry 0 is read.
        labels: The two class names used as panel headings.
        name: Figure-level title.
    """
    ranking = importance[0]
    top_scores = [entry[0] for entry in ranking['tops']]
    top_words = [entry[1] for entry in ranking['tops']]
    bottom_scores = [entry[0] for entry in ranking['bottom']]
    bottom_words = [entry[1] for entry in ranking['bottom']]

    plt.figure(figsize=(10, 10))
    y_pos = np.arange(len(top_words))

    def draw_panel(position, scores, words, heading, figure_title):
        # One bar chart of the figure; the figure title is (re)set by every panel,
        # so the value passed for the last panel is the one that remains.
        plt.subplot(position)
        plt.barh(y_pos, scores, align='center', alpha=0.5)
        plt.title(heading, fontsize=20)
        plt.yticks(y_pos, words, fontsize=14)
        plt.suptitle(figure_title, fontsize=16)
        plt.xlabel('Importance', fontsize=20)

    draw_panel(121, bottom_scores, bottom_words, labels[0], 'Key words')
    draw_panel(122, top_scores, top_words, labels[1], name)

    plt.subplots_adjust(wspace=0.8)

In [None]:
def plot_important_words_multi_class(importance, class_labels, name):
    """Plot the highest-weighted terms of every class, one bar chart per class.

    Opens a new 10x10 figure and lays the charts out in two columns.

    Args:
        importance: Result of get_most_important_features(), indexed 0..k-1.
        class_labels: Class names used as chart headings, indexed like ``importance``.
        name: Figure-level title.
    """
    plt.figure(figsize=(10,10))
    grid_rows = int(len(importance)/2)+1
    for i in range(len(importance)):
        # (term, weight) pairs ordered by ascending weight, so the heaviest bar ends up on top.
        ranked = sorted(((entry[1], entry[0]) for entry in importance[i]['tops']),
                        key=lambda pair: pair[1])
        top_words = [pair[0] for pair in ranked]
        top_scores = [pair[1] for pair in ranked]
        y_pos = np.arange(len(top_words))

        plt.subplot(grid_rows, 2, i + 1)
        plt.barh(y_pos,top_scores, align='center', alpha=0.5)
        plt.title(class_labels[i], fontsize=20, fontname='Segoe UI Emoji')
        plt.yticks(y_pos, top_words, fontsize=14)
        plt.suptitle(name, fontsize=16)
        plt.xlabel('Importance', fontsize=14)

    plt.subplots_adjust(wspace=0.8, hspace=0.6)

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches

def plot_LSA(test_data, test_labels, savepath="PCA_demo.csv", plot=True):
    """Scatter-plot documents in the plane of their first two LSA components.

    The data is reduced to two dimensions with a truncated SVD and drawn on the
    current matplotlib figure, coloured by label (first label in sorted order
    orange, second blue).

    Args:
        test_data: Vectorized documents (e.g. the output of a fitted
            CountVectorizer / tf-idf pipeline).
        test_labels: One label per row of ``test_data``.
        savepath: Unused; kept so existing calls keep working.
        plot: If False, nothing is drawn.
    """
    # Fixed seed: the randomized SVD solver otherwise yields a slightly different
    # projection on every run.
    lsa = TruncatedSVD(n_components=2, random_state=0)
    lsa.fit(test_data)
    lsa_scores = lsa.transform(test_data)
    # Sorted, because plain set iteration order can change between kernel
    # restarts and would swap the class colours from one run to the next.
    label_names = sorted(set(test_labels))
    color_mapper = {label: idx for idx, label in enumerate(label_names)}
    color_column = [color_mapper[label] for label in test_labels]
    colors = ['orange', 'blue']
    if plot:
        plt.scatter(lsa_scores[:,0], lsa_scores[:,1], s=8, alpha=.8, c=color_column, cmap=mpl.colors.ListedColormap(colors))
        # One legend entry per label that is actually present (no IndexError
        # when the data holds a single class).
        legend_handles = [mpatches.Patch(color=color, label=label)
                          for color, label in zip(colors, label_names)]
        plt.legend(handles=legend_handles, prop={'size': 30})


End of definitions

------------------

Start data preparation



In [None]:
# Load the labelled tweets (';'-separated), keep only complete rows and renumber them from 0.
importdf = (
    pd.read_csv('../data/trainingssets/all_emoji_tweets_03_12_18_7_labels_excluded.csv',
                sep =';', usecols=['tweet_full_text', 'target'])
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Raw label strings, one per tweet; a string may list several emoji separated by commas.
all_targets = importdf['target'].astype(str).tolist()

In [None]:
# Only the first emoji of each tweet is used as its label for now.
y = [target.split(',')[0] for target in all_targets]

# One-column frames (column 0) for labels and tweet texts; they are filtered
# in parallel when the task is reduced to binary classification later on.
dfy = pd.DataFrame(y)
dfx = pd.DataFrame(importdf['tweet_full_text'])
dfx.columns = range(dfx.shape[1])

In [None]:
# convert to binary classification
binary_labels=['♥️', '😂']   # two Labels chosen for binary classification

# Keep only the rows labelled with one of the two classes. dfx and dfy carry the
# same row index, so one boolean mask filters both consistently.
is_binary = dfy[0].isin(binary_labels)
dfy = dfy[is_binary]
dfx = dfx[is_binary]

df = dfx.rename(columns={0: "tweet"})
df['target'] = dfy[0]

# balance classes to 50:50 by dropping a random subset of the majority class
class_freq = df['target'].value_counts()
assert len(class_freq) == 2, "expected both binary labels to be present in the data"
# Derive the majority class from the counts instead of hard-coding it: with a
# different data file the hard-coded label could be the minority class, and the
# drop would then make the imbalance worse.
majority_class = class_freq.idxmax()
surplus = int(class_freq.max() - class_freq.min())
surplus_rows = df[df['target'] == majority_class].sample(n=surplus, random_state=123)
df = df.drop(surplus_rows.index)

# prepare data for following steps
our_tweets = df['tweet'].astype(str).values.tolist()
y = np.array(df['target'])

In [None]:
# Clean every tweet once; the resulting list is the input corpus for all models below.
corpus = [clean_tweets(tweet) for tweet in our_tweets]
# Show a small sample only: displaying the whole corpus floods the notebook output.
corpus[:5]

End Data preparation 

-----------------------------

Start data visualisation

In [None]:
# 60/40 train/test split with a fixed seed; every section below repeats this exact call,
# so all models are trained and evaluated on the same partition.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
# Bag-of-words features only (unigrams and bigrams), no classifier: used for the LSA plot.
pipe_transformer = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
])
# The trailing ';' suppresses the cell output.
pipe_transformer.fit(X_train, y_train);

In [None]:
# LSA scatter plot of the count-vectorized training tweets; plot_LSA() draws on the
# figure opened here, which is then written to the figures directory and shown.
fig = plt.figure(figsize=(16, 16))          
plot_LSA(pipe_transformer.transform(X_train), y_train)
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/countvectorizer_lsa')
plt.show()

In [None]:
# Same split as before; this time the term counts are additionally tf-idf weighted.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
# Replaces the count-only pipe_transformer defined in the previous section.
pipe_transformer = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
])
# The trailing ';' suppresses the cell output.
pipe_transformer.fit(X_train, y_train);

In [None]:
# LSA scatter plot of the tf-idf weighted training tweets; plot_LSA() draws on the
# figure opened here, which is then written to the figures directory and shown.
fig = plt.figure(figsize=(16, 16))          
plot_LSA(pipe_transformer.transform(X_train), y_train)
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/tfidf_lsa')
plt.show()

In [None]:
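# For use after an arbitrary GridSearch.
# FIXME: currently needs to be run twice to work. Likely cause: steps.pop(1) removes the
# 'tfidf' step (index 1) rather than the final 'clf' step, so the first run fails in
# transform() and leaves the pipeline without 'tfidf'; popping the last step
# (steps.pop(-1)) should fix it.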
#fig = plt.figure(figsize=(16, 16))          
#clf = gs_clf.best_estimator_.steps.pop(1)
#plot_LSA(gs_clf.best_estimator_.transform(X_train), y_train)
#gs_clf.best_estimator_.steps.append(clf)
#plt.show()

End data visualisation

-----------------------

Start simple MultinomialNB

In [None]:
# Baseline model: plain term counts fed into a multinomial Naive Bayes classifier
# with default settings, on the same 60/40 split as the other sections.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe_clf = Pipeline([
    ('vect', CountVectorizer(min_df=5)),
    ('clf', MultinomialNB()),
])
pipe_clf.fit(X_train, y_train)
predicted = pipe_clf.predict(X_test)
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/countvectorizer_multinomialnb_confusion_matrix')
plt.show()

In [None]:
# plot_important_words_binary() opens its own figure, so no plt.figure() here
# (an extra call only leaves an empty figure behind).
importance = get_most_important_features(pipe_clf.named_steps['vect'], pipe_clf.named_steps['clf'], 10)
plot_important_words_binary(importance, pipe_clf.named_steps['clf'].classes_, "Most important words")
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/countvectorizer_multinomialnb_feature_importance')
plt.show()

End Simple MultinomialNB

----------------------------

AB HIER ZELLEN SELEKTIV AUSFÜHREN. Das Trainieren einiger Modelle nimmt enorm viel Zeit in Anspruch.

Start Advanced MultinomialNB

In [None]:
# Multinomial Naive Bayes on normalised unigram/bigram counts (idf weighting switched off),
# with the settings that scored best in the test runs documented below this cell.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB(alpha=0.1))
])

# Every entry is commented out, so the grid is empty and the search only
# cross-validates the pipeline exactly as configured above.
parameters = {
     #'vect__min_df': (40,60),
     #'vect__max_df': (0.8,0.81),
}
# NOTE(review): the `iid` argument was removed from GridSearchCV in scikit-learn 0.24 -
# confirm the pinned scikit-learn version still accepts it.
gs_clf = GridSearchCV(pipe_clf, parameters, cv=3, iid=False, n_jobs=-1)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

Test runs (File from 22.11.18):

Run1: 
     * min_df = 60,
     * 'vect__ngram_range': [(1, 1), (1, 2)],
     * 'tfidf__use_idf': (True, False),
     * 'clf__alpha': (0, 0.1, 1),
     
Score: 0.73, ngram_range=(1,1), use_idf=False, alpha=0.1

Run2:
     * alpha=0.1
     * 'vect__ngram_range': [(1, 1), (1, 2)],
     * 'vect__min_df': (1, 20, 0.1),
     * 'vect__max_df': (0.8, 0.95),
     * 'tfidf__use_idf': (True, False)

Score: 0.7741, ngram_range=(1,2), use_idf=False, max_df=0.8, min_df=1

Run3:
     * alpha=0.1
     * ngram_range=(1, 2),
     * min_df=1,
     * 'vect__max_df': (0.5,0.6,0.7,0.8),
     * use_idf=False

Score: 0.7741, max_df=0.5

Run4:
     * alpha=0.1
     * ngram_range=(1, 2),
     * 'vect__min_df': (60, 100, 0.01, 0.1),
     * 'vect__max_df': (0.5, 0.8),
     * use_idf=False

Score: 0.733, max_df=0.5, min_df=60


In [None]:
# Report the winning value of every hyper-parameter that was part of the search.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/tfidftransformer_multinomialnb_confusion_matrix')
plt.show()

In [None]:
# plot_important_words_binary() opens its own figure, so no plt.figure() here
# (an extra call only leaves an empty figure behind).
importance = get_most_important_features(gs_clf.best_estimator_.named_steps['vect'], gs_clf.best_estimator_.named_steps['clf'], 10)
plot_important_words_binary(importance, gs_clf.best_estimator_.named_steps['clf'].classes_, "Most important words")
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/tfidftransformer_multinomialnb_feature_importances')
plt.show()

End Advanced MultinomialNB

----------------------------

Start Advanced SGDClassifier

In [None]:
# Linear model trained with stochastic gradient descent on tf-idf weighted
# unigram/bigram counts, on the same 60/40 split as the other sections.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # tol=None with max_iter=100: no tolerance-based stopping, a fixed budget of epochs.
    ('clf', SGDClassifier(random_state=0, 
                          max_iter=100, tol=None))
])

# 2 losses x 2 penalties x 2 learning-rate schedules = 8 candidates, each cross-validated 3-fold.
parameters = {
     'clf__loss': ['modified_huber', 'log'],
     'clf__penalty': ['elasticnet', 'l2'],
     'clf__alpha': [1e-5],
     'clf__epsilon': [0.01],
     'clf__learning_rate': ['invscaling', 'optimal'],
     'clf__eta0': [10]
     #'clf__eta0': [1e-4, 0.1],
}
# NOTE(review): the `iid` argument was removed from GridSearchCV in scikit-learn 0.24 -
# confirm the pinned scikit-learn version still accepts it.
gs_clf = GridSearchCV(pipe_clf, parameters, cv=3, iid=False, n_jobs=-1)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

Test runs (File from 29.11.18):

Run1:
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * 'clf__loss': ['modified_huber'],
     * 'clf__penalty': ['elasticnet'],
     * 'clf__alpha': [1e-5],
     * 'clf__epsilon': [0.01],
     * 'clf__learning_rate': ['invscaling'],
     * 'clf__eta0': [10]
     
Score: 0.786900

Test runs (File from 22.11.18):

Run1: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 3
     * tol = None
     * ngram_range=(1,2)
     * 'tfidf__use_idf': (True, False),
     * 'clf__loss': ['hinge', 'log'],
     * 'clf__penalty': ['l2', 'l1'],
     * 'clf__alpha': [1e-4, 0.1],
     
Score: 0.74145 clf__alpha: 0.0001, loss: 'hinge', penalty: 'l2', use_idf: True

Run2: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 1000
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * loss = 'log',
     * penalty = 'l2',
     * alpha = 1e-4
     
Score: 0.7378

Run3: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * loss = 'log',
     * penalty = 'l2',
     * 'clf__alpha': [1e-3,1e-2,0.1,1,10]
     
Score: 0.60, alpha=1e-3

Run4: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * loss = 'log',
     * penalty = 'l2',
     * 'clf__alpha': [1e-4, 1e-5, 1e-6],
     
Score: 0.773987, alpha=1e-5

Run5: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * 'clf__loss': ['hinge', 'log'],
     * 'clf__penalty': ['l2', 'l1'],
     * 'clf__alpha': [1e-5],
     
Score: 0.7733, alpha=1e-05, loss='log', penalty: 'l2'

Run6: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * 'clf__loss': ['hinge', 'log'],
     * 'clf__penalty': ['l2', 'elasticnet'],
     * 'clf__alpha': [5e-5, 1e-5, 5e-6],
     
Score: 0.7761, alpha=5e-06, loss='log', penalty='l2'

Run7: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * 'clf__loss': ['modified_huber'],
     * 'clf__penalty': ['l2', 'elasticnet'],
     * 'clf__alpha': [1e-5, 5e-6],
     * 'clf__epsilon': [0.01, 0.1, 1],
     
Score: 0.773966, alpha=1e-05, epsilon=0.01, penalty='elasticnet'

Run8: 
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * 'clf__loss': ['modified_huber','log'],
     * 'clf__penalty': ['l2', 'elasticnet'],
     * 'clf__alpha': [1e-5],
     * 'clf__epsilon': [0.01, 0.001],
     * 'clf__learning_rate': ['invscaling', 'adaptive'],
     * 'clf__eta0': [10, 1, 0.1]
     
Score: 0.7765, epsilon=0.01, eta0=10, learning_rate='invscaling', loss='modified_huber', penalty='elasticnet'

Run9:
     * min_df = 1,
     * max_df = 0.9
     * max_iter = 100
     * tol = None
     * ngram_range=(1,2)
     * use_idf = True,
     * 'clf__loss': ['modified_huber'],
     * 'clf__penalty': ['elasticnet'],
     * 'clf__alpha': [1e-5],
     * 'clf__epsilon': [0.01],
     * 'clf__learning_rate': ['invscaling', 'adaptive'],
     * 'clf__eta0': [10, 100, 1000]
     
Score: 0.7765, epsilon=0.01, eta0=10, learning_rate='invscaling'

In [None]:
# Report the winning value of every hyper-parameter that was part of the search.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/tfidftransformer_sgdclassifier_confusion_matrix')
plt.show()

In [None]:
# plot_important_words_binary() opens its own figure, so no plt.figure() here
# (an extra call only leaves an empty figure behind).
importance = get_most_important_features(gs_clf.best_estimator_.named_steps['vect'], gs_clf.best_estimator_.named_steps['clf'], 10)
plot_important_words_binary(importance, gs_clf.best_estimator_.named_steps['clf'].classes_, "Most important words")
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/tfidftransformer_sgdclassifier_feature_importances')
plt.show()

End Advanced SGDClassifier

----------------------------

Start Advanced RandomForest

In [None]:
# Random forest of 100 trees on tf-idf weighted unigram/bigram counts,
# on the same 60/40 split as the other sections.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0))
])

# Every entry is commented out, so the grid is empty and the search only
# cross-validates the pipeline exactly as configured above.
parameters = {
     #'vect__ngram_range': [(1, 1), (1, 2)],
     #'vect__max_df': [0.9, 1.0],
     #'tfidf__use_idf': (True, False)''
     #'clf__criterion': ['gini', 'entropy'],
     #'clf__max_features': ['log2', 'auto', 0.5],
}
# NOTE(review): the `iid` argument was removed from GridSearchCV in scikit-learn 0.24 -
# confirm the pinned scikit-learn version still accepts it.
gs_clf = GridSearchCV(pipe_clf, parameters, cv=3, iid=False, n_jobs=-1)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

Test runs from tweets until 22.11.2018:

Run1:
     * min_df = 1,
     * max_df = 0.9
     * ngram_range=(1,2)
     * use_idf = True,
     * max_depth = None
     * n_estimators=10

Score: 0.71418

Run2:
     * min_df = 1,
     * max_df = 0.9
     * ngram_range=(1,2)
     * use_idf = True,
     * max_depth = None
     * n_estimators=100

Score: 0.7478993357500576

Run3:
     * min_df = 1,
     * max_df = 0.9
     * ngram_range=(1,2)
     * use_idf = True,
     * max_depth = None
     * n_estimators=1000

Score: 0.7520482786077992

In [None]:
# Report the winning value of every hyper-parameter that was part of the search.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.savefig('../figures/tweets_from_03_12_18/binaryclass/tfidftransformer_randomforestclassifier_confusion_matrix')
plt.show()

End Advanced Random Forest

----------------------------

Start Logistic Regression

In [None]:
# Logistic regression with built-in 3-fold cross-validation, trained on tf-idf features
# that are produced by a separate, classifier-free pipeline.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True))
])

# Left over from the grid-search sections: every entry is commented out and the dict
# is not passed to LogisticRegressionCV below.
parameters = {
     #'vect__ngram_range': [(1, 1), (1, 2)],
     #'vect__max_df': [0.9, 1.0],
     #'tfidf__use_idf': (True, False)''
#     'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
#     'clf__C': [0.1, 1, 10, 30, 100],
}
# The feature pipeline is fitted on the training tweets only and then applied to both splits.
pipe.fit(X_train) 
# Named gs_clf like the grid searches of the other sections, but this is the classifier itself.
gs_clf = LogisticRegressionCV(multi_class='auto', cv=3, n_jobs=-1)
gs_clf.fit(pipe.transform(X_train), y_train)
predicted = gs_clf.predict(pipe.transform(X_test))
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

Test runs (File from 29.11):

Run 1:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * multi_class = 'auto'

score: 0.78506

In [None]:
# gs_clf is a LogisticRegressionCV in this section, not a grid search: it has no
# best_params_ attribute, so report the regularisation strength it selected instead.
print("C: %r" % (gs_clf.C_,))

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.show()

In [None]:
# plot_important_words_binary() opens its own figure, so no plt.figure() here
# (an extra call only leaves an empty figure behind).
importance = get_most_important_features(pipe.named_steps['vect'], gs_clf, 10)
plot_important_words_binary(importance, gs_clf.classes_, "Most important words")
plt.show()

End Logistic Regression

----------------------------

Start AdaBoost

In [None]:
# AdaBoost over decision stumps on tf-idf weighted unigram/bigram counts,
# on the same 60/40 split as the other sections.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    # Seeded like the other models so that repeated runs give the same score.
    ('clf', AdaBoostClassifier(n_estimators=1000, random_state=0))
])

parameters = {
     #'vect__ngram_range': [(1, 1), (1, 2)],
     #'vect__max_df': [0.9, 1.0],
     #'tfidf__use_idf': (True, False)''
     'clf__base_estimator': [DecisionTreeClassifier(max_depth=1)],
     #'clf__C': [0.1, 1, 10, 30, 100],
}

# The grid holds a single candidate, so it is searched exhaustively; a
# RandomizedSearchCV would try to draw its default n_iter=10 samples from it.
gs_clf = GridSearchCV(pipe_clf, parameters, cv=3, n_jobs=-1)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

Test runs (File from 29.11.18):
    
Run1:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * n_estimators=10
 * clf__base_estimator: [DecisionTreeClassifier(max_depth=1)]
 
Score: 0.694144

Run2:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * n_estimators=10
 * clf__base_estimator: [MultinomialNB(alpha=0.1)]
  
Score: 0.63089

Run3:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * n_estimators=10
 * clf__base_estimator: [SGDClassifier(random_state=0, loss='modified_huber', penalty='elasticnet',
                                alpha=1e-5, epsilon=0.01, learning_rate='invscaling', eta0=10,
                                max_iter=100, tol=None)]
Score: 0.63

Run4:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * n_estimators=10
 * clf__base_estimator: [SGDClassifier(random_state=0, loss='log', penalty='l2',
                                alpha=1e-5)]

Score: 0.63026

Run5:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * n_estimators=1000
 * clf__base_estimator: [DecisionTreeClassifier(max_depth=1)]

Score: 0.7532143119535669 (overfit)

In [None]:
# Report the winning value of every hyper-parameter that was part of the search.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.show()

End AdaBoost

----------------------------

Start GradientBoosting

In [None]:
# Gradient boosting on tf-idf weighted unigram/bigram counts,
# on the same 60/40 split as the other sections.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
pipe_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=5, ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', GradientBoostingClassifier(n_estimators=100, random_state=0, verbose=1))
])

# 4 x 3 x 2 = 24 possible combinations; the randomized search samples only part of them.
parameters = {
     #'vect__ngram_range': [(1, 1), (1, 2)],
     #'vect__max_df': [0.9, 1.0],
     #'tfidf__use_idf': (True, False)''
     'clf__learning_rate': [0.001, 0.01, 0.1, 1],
     'clf__max_depth': [1, 3, 5],
     'clf__loss' : ['deviance', 'exponential']
}
# random_state pins which combinations are sampled; without it every run evaluates a
# different subset and the reported best parameters are not reproducible.
gs_clf = RandomizedSearchCV(pipe_clf, parameters, cv=3, n_jobs=-1, verbose=2, random_state=0)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out 40% (displayed as the cell result).
np.mean(predicted == y_test)  

Test runs (File from 29.11.2018):

Run1:
 * min_df = 1,
 * max_df = 0.9
 * ngram_range=(1,2)
 * use_idf = True,
 * n_estimators=100
 * 'clf__learning_rate': [0.001, 0.01, 0.1, 1],
 * 'clf__max_depth': [1, 3, 5],
 * 'clf__loss' : ['deviance', 'exponential']
 
 
   Iter       Train Loss   Remaining Time 
         1           1.2279            5.40m
         2           1.1986            5.10m
         3           1.1803            4.95m
         4           1.1670            4.77m
         5           1.1555            4.74m
         6           1.1466            4.63m
         7           1.1363            4.61m
         8           1.1267            4.54m
         9           1.1186            4.48m
        10           1.1115            4.42m
        20           1.0570            3.87m
        30           1.0211            3.40m
        40           0.9952            2.89m
        50           0.9714            2.40m
        60           0.9509            1.91m
        70           0.9332            1.43m
        80           0.9178           57.16s
        90           0.9044           28.45s
       100           0.8921            0.00s
       
Score: 0.7418999338770113, clf__learning_rate: 1, clf__loss: 'deviance', clf__max_depth: 5

In [None]:
# Report the winning value of every hyper-parameter that was part of the search.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

In [None]:
plt.figure()
# Pass the label order explicitly so the matrix rows/columns are guaranteed to line up
# with the tick labels (without `labels`, confusion_matrix uses the sorted label order).
cnf_matrix = confusion_matrix(y_test, predicted, labels=binary_labels)
plot_confusion_matrix(cnf_matrix, classes=binary_labels, normalize=True,
                      title='Confusion matrix, with normalization')
plt.show()