In [None]:
import nltk
# NOTE(review): `twitter_samples` is not referenced by any cell visible in this
# notebook — confirm the download is still needed.
nltk.download('twitter_samples')
# Stop-word corpus; the German list from it is used when cleaning tweets.
nltk.download('stopwords')

In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt

In [None]:
# Load the labelled tweets; only the tweet text and the emoji target column are read.
importdf = pd.read_csv(
    '../data/trainingssets/all_emoji_tweets_03_12_18_7_labels_excluded.csv',
    sep=';',
    usecols=['tweet_full_text', 'target'],
)
# Rows with a missing text or a missing target cannot be used for training.
importdf = importdf.dropna()
our_tweets = importdf['tweet_full_text'].astype(str).tolist()
our_targets = importdf['target'].astype(str).tolist()
# Show a short preview instead of rendering every tweet in the notebook output.
our_tweets[:5]

In [None]:
# Show a short preview instead of rendering every label in the notebook output.
our_targets[:5]

Preprocessing steps applied to each tweet:

- Remove stock market tickers like `$GE`
- Remove the retweet marker "RT"
- Remove hyperlinks
- Remove hashtags (only the hash sign #, not the word itself)
- Remove German stop words such as der, die, und, ist, etc.
- Remove emoticons like :), :D, :(, :-), etc.
- Remove punctuation such as full stops, commas, exclamation marks, etc.
- Reduce words to their stem with the Snowball stemmer for German (analogous to the English words ‘working’, ‘works’, and ‘worked’ all being reduced to the stem “work”).

In [None]:
import string
import re
 
from nltk.corpus import stopwords
# Stored as a set: clean_tweets tests every token for membership, and a set
# lookup is constant-time whereas the raw list is scanned linearly.
stopwords_german = set(stopwords.words('german'))

from nltk.stem.snowball import SnowballStemmer
# Snowball stemmer configured for German, matching the stop-word language.
stemmer = SnowballStemmer('german')
 
from nltk.tokenize import TweetTokenizer
 
# Emoticons expressing a positive mood.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons expressing a negative mood.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon, regardless of mood.
emoticons = emoticons_happy | emoticons_sad
 
def clean_tweets(tweet):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. strip stock tickers (``$GE``), a leading "RT", hyperlinks and the
         ``#`` sign of hashtags (the hashtag word itself is kept);
      2. replace four-digit years with the token ``ayearzzz`` and remaining
         stand-alone numbers with the token ``anumberzzz``;
      3. tokenize with ``TweetTokenizer`` (preserve_case=False,
         strip_handles=True, reduce_len=True);
      4. drop tokens found in ``stopwords_german``, ``emoticons`` or
         ``string.punctuation`` and stem the remaining ones with ``stemmer``;
      5. remove digit-only tokens that are still present after tokenizing.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/[^\s]*', '', tweet)

    # remove hashtags: only the hash sign is dropped, the word stays
    tweet = re.sub(r'#', '', tweet)

    # Replace years with the 'ayearzzz' token. The digit look-arounds make sure
    # exactly four digits are matched; without them a longer digit run such as
    # "12345" was mangled into "ayearzzz5" and escaped the number rule below.
    tweet = re.sub(r'(?<![0-9])[1-2][0-9]{3}(?![0-9])', r'ayearzzz', tweet)

    # replace numbers with the 'anumberzzz' token, only numbers outside of words
    tweet = re.sub(r'(?<![0-9a-zA-Z])[0-9]+(?![0-9a-zA-Z])', r'anumberzzz', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # drop stop words, emoticons and punctuation; stem everything else
    stems = [
        stemmer.stem(token)
        for token in tweet_tokens
        if token not in stopwords_german
        and token not in emoticons
        and token not in string.punctuation
    ]
    tweets_clean = " ".join(stems)

    # remove numbers that were pulled out of words by the tokenizer
    tweets_clean = re.sub(r'(?<![0-9a-zA-Z])[0-9]+(?![0-9a-zA-Z])', r'', tweets_clean)

    return tweets_clean
 
# Quick smoke test of the cleaning function on a hand-written example tweet.
custom_tweet = (
    "RT @Twitter @chapagain Hello There! Have a great day. :) "
    "#good #morning http://chapagain.com.np"
)

# show the cleaned version of the example
print(clean_tweets(custom_tweet))

In [None]:
# Clean every tweet once; the resulting strings are what the models below are trained on.
corpus = [clean_tweets(tweet) for tweet in our_tweets]
# Show a short preview instead of rendering the whole corpus in the notebook output.
corpus[:5]

In [None]:
# Spot-check a slice of the cleaned tweets.
corpus[13:25]

In [None]:
# A target may list several comma-separated emojis; only the first one is used as the label for now.
y = [target.split(',')[0] for target in our_targets]
# Show a short preview instead of rendering every label in the notebook output.
y[:5]

Open questions / ideas:

- Replace numbers with an "a number" token?
- Add a filter that deletes non-Latin letters?
    - might be hack-fixed by the `min_df` parameter
- Spelling fixes?
    - might be hack-fixed by the `min_df` parameter
        - very inelegant, because all misspelled tokens are simply discarded
- Do bold fonts / different typefaces just use different character codes (beyond ASCII)?
    - might be hack-fixed by the `min_df` parameter

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [None]:
def get_most_important_features(vectorizer, model, n=5):
    """Return the highest- and lowest-weighted vocabulary words for each class.

    Parameters
    ----------
    vectorizer : object
        Fitted vectorizer exposing ``vocabulary_`` (a mapping word -> column index).
    model : object
        Fitted model exposing a 2-D ``coef_`` array (one row per class). When
        ``coef_`` is missing, ``feature_log_prob_`` is used instead.
        NOTE(review): for a two-class naive Bayes model the two attributes may
        have a different number of rows — confirm before relying on the fallback.
    n : int, default 5
        Number of words to report at each end; values <= 0 yield empty lists.

    Returns
    -------
    dict
        ``{row_index: {'tops': [...], 'bottom': [...]}}`` where every entry is a
        ``(weight, word)`` tuple. ``'tops'`` holds the n largest weights in
        ascending order, ``'bottom'`` the n smallest in descending order.
    """
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    weights = getattr(model, 'coef_', None)
    if weights is None:
        weights = model.feature_log_prob_

    # A negative n would make the slices below select from the wrong end.
    n = max(n, 0)

    classes = {}
    for class_index in range(weights.shape[0]):
        word_importances = [
            (weight, index_to_word[column])
            for column, weight in enumerate(weights[class_index])
        ]
        ranked = sorted(word_importances, key=lambda pair: pair[0], reverse=True)
        tops = sorted(ranked[:n], key=lambda pair: pair[0])
        # ranked[-0:] would be the whole list, so n == 0 needs an explicit guard.
        bottom = ranked[-n:] if n else []
        classes[class_index] = {
            'tops': tops,
            'bottom': bottom,
        }
    return classes

In [None]:
def plot_important_words_binary_classification(top_scores, top_words, bottom_scores, bottom_words, name,
                                               top_title='Disaster', bottom_title='Irrelevant'):
    """Draw two horizontal bar charts side by side for a binary classifier.

    The left panel shows the ``bottom_*`` words, the right panel the ``top_*``
    words; each panel is ordered by score.

    Parameters
    ----------
    top_scores, top_words : sequence
        Scores and matching words for the right-hand panel.
    bottom_scores, bottom_words : sequence
        Scores and matching words for the left-hand panel.
    name : str
        Figure-level title.
    top_title, bottom_title : str
        Panel titles; the defaults are the labels this function used before
        the titles became configurable.
    """
    top_pairs = sorted(zip(top_words, top_scores), key=lambda pair: pair[1])
    bottom_pairs = sorted(zip(bottom_words, bottom_scores), key=lambda pair: pair[1], reverse=True)

    plt.figure(figsize=(10, 10))

    panels = (
        (121, bottom_title, bottom_pairs),
        (122, top_title, top_pairs),
    )
    for position, title, pairs in panels:
        words = [word for word, _ in pairs]
        scores = [score for _, score in pairs]
        # Bar positions are computed per panel so the two word lists may differ in length.
        y_pos = np.arange(len(words))

        plt.subplot(position)
        plt.barh(y_pos, scores, align='center', alpha=0.5)
        plt.title(title, fontsize=20)
        plt.yticks(y_pos, words, fontsize=14)
        plt.xlabel('Importance', fontsize=20)

    # Set once: the earlier interim 'Key words' super-title was always overwritten by this one.
    plt.suptitle(name, fontsize=16)
    plt.subplots_adjust(wspace=0.8)
    plt.show()

In [None]:
from __future__ import unicode_literals
def plot_important_words(importance, class_labels, name):
    """Plot one horizontal bar chart of the top-weighted words per class.

    Parameters
    ----------
    importance : mapping
        ``importance[i]['tops']`` is a list of ``(score, word)`` tuples for
        class ``i``; indices ``0 .. len(importance) - 1`` are read.
    class_labels : sequence
        ``class_labels[i]`` is used as the title of panel ``i``.
    name : str
        Figure-level title.
    """
    n_cols = 2
    # Ceiling division: the former int(len / 2) + 1 reserved an empty extra
    # row whenever the number of classes was even.
    n_rows = (len(importance) + n_cols - 1) // n_cols

    plt.figure(figsize=(20,200))
    for i in range(len(importance)):
        # order the bars by score, smallest first
        pairs = sorted(importance[i]['tops'], key=lambda pair: pair[0])
        top_scores = [score for score, _ in pairs]
        top_words = [word for _, word in pairs]
        y_pos = np.arange(len(top_words))

        plt.subplot(n_rows, n_cols, i + 1)
        plt.barh(y_pos, top_scores, align='center', alpha=0.5)
        # NOTE(review): 'Segoe UI Emoji' looks like a platform-specific font used
        # to render emoji labels — confirm it exists where the notebook is run.
        plt.title(class_labels[i], fontsize=20, fontname='Segoe UI Emoji')
        plt.yticks(y_pos, top_words, fontsize=14)
        plt.xlabel('Importance', fontsize=14)

    # The figure title does not depend on the panel, so it is set once.
    plt.suptitle(name, fontsize=16)
    plt.subplots_adjust(wspace=0.8, hspace=0.6)
    plt.show()

Start of simple naive bayes evaluation

In [None]:
# Hold out 40% of the tweets for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)
# Bag of uni-/bigrams -> tf-idf weighting -> multinomial naive Bayes.
# Keyword arguments must be passed with `=`; the former `ngram_range: (1, 2)` and
# `_use_idf: True` were syntax errors that kept this cell from running.
text_clf = Pipeline([
    ('vect', CountVectorizer(max_df=0.9, min_df=0.0001, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Accuracy: share of test tweets whose predicted label equals the true label.
np.mean(predicted == y_test)

In [None]:
# Dict form of the report; a later cell uses it to select labels by F1 score.
report_dict = metrics.classification_report(y_test, predicted, output_dict=True)
# Human-readable form of the same report.
print(metrics.classification_report(y_test, predicted))

* The whitespace label is still present
* aFood and anAnimal are remarkably good
* Copyright as well?
* Rare emojis are sometimes pretty good

In [None]:
# Only per-label rows are dicts of scores. Some scikit-learn versions add a
# scalar 'accuracy' entry to the report, which would make the ['f1-score']
# lookup below raise a TypeError, so non-dict entries are skipped up front.
label_rows = [(label, scores) for label, scores in report_dict.items() if isinstance(scores, dict)]
sorted_by_f1 = sorted(label_rows, key=lambda kv: float(kv[1]['f1-score']))
# Filter lower score labels, the annoying small white-space '' and meta-data
summary_rows = {'', 'weighted avg', 'micro avg', 'macro avg'}
filtered_by_f1 = [x for x in sorted_by_f1
                  if x[1]['f1-score'] >= 0.1 and x[0] not in summary_rows]
# best-scoring labels first
filtered_by_f1.reverse()
remaining_labels = [x[0] for x in filtered_by_f1]
remaining_labels

End of simple NB

Try different models on all data

In [None]:
# Same 60/40 split (same seed) as in the simple naive Bayes cell.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Grid over n-gram range, document-frequency cut-offs and idf weighting.
parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'vect__min_df': [1e-4, 0.1],
     'vect__max_df': [0.9, 1.0],
     'tfidf__use_idf': (True, False)
}
# NOTE(review): the `iid` argument is not accepted by newer scikit-learn
# releases — confirm the installed version before re-running this cell.
gs_clf = GridSearchCV(text_clf, parameters, cv=3, iid=False, n_jobs=2)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out tweets.
np.mean(predicted == y_test)  

Best params (score: 0.1337):
* tfidf__use_idf: True
* vect__max_df: 0.9
* vect__min_df: 0.0001
* vect__ngram_range: (1, 2)

In [None]:
# Same 60/40 split (same seed) as in the cells above.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)

# Uni-/bigram tf-idf features fed into an SGD-trained classifier with hinge loss.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=0, n_jobs=2,
                          max_iter=10, tol=None)),
])

# Every grid entry is commented out, so `parameters` is an empty dict here.
parameters = {
     #'vect__ngram_range': [(1, 1), (1, 2)],
     #'clf__loss': ['hinge', 'log'],
     #'clf__penalty': ['l2', 'l1'],
     #'clf__alpha': [1e-4, 0.1],
     #'vect__max_df': [0.9, 1.0],
     #'tfidf__use_idf': (True, False)
}
# Grid search is disabled; the pipeline is fitted directly with the settings above.
#gs_clf = GridSearchCV(text_clf, parameters, cv=3, iid=False, n_jobs=2)
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Accuracy on the held-out tweets.
np.mean(predicted == y_test)  

Best params (score 0.127):
* loss: 'hinge'
* ngram_range: (1,2)
* use_idf: True

Others:
* loss: 'log' -> score 0.08

In [None]:
# Same 60/40 split (same seed) as in the cells above.
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.4, random_state=0)

# Uni-/bigram tf-idf features fed into a random forest of 500 depth-limited trees.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', RandomForestClassifier(n_estimators=500, max_depth=6, n_jobs=2, random_state=0)),
])

# Every grid entry is commented out, so `parameters` is an empty dict here.
parameters = {
     #'vect__ngram_range': [(1, 1), (1, 2)],
     #'clf__max_depth': [2, 4],
     #'clf__penalty': ['l2', 'l1'],
     #'clf__alpha': [1e-4, 0.1],
     #'vect__max_df': [0.9, 1.0],
     #'tfidf__use_idf': (True, False)
}
# Grid search is disabled; the pipeline is fitted directly with the settings above.
#gs_clf = GridSearchCV(text_clf, parameters, cv=3, iid=False, n_jobs=2)
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Accuracy on the held-out tweets.
np.mean(predicted == y_test)  

Best score: 0.0739
* n_estimators: 100
* max_depth: 2
* ngram_range: (1,2)
* use_idf: True

In [None]:
# Print the best value found for every parameter in the current grid.
# NOTE(review): when the notebook is run top to bottom, `parameters` is the empty
# dict from the preceding cell and `gs_clf` is the earlier naive Bayes grid search,
# so this loop prints nothing — confirm which search it is meant to report.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

End of models on all data

Start with reduced classes

In [None]:
# Sizes before filtering.
print(len(y), len(corpus))
# Set membership is constant-time; testing against the list re-scanned it for every tweet.
kept_labels = set(remaining_labels)
kept_pairs = [(text, label) for text, label in zip(corpus, y) if label in kept_labels]
reduced_corpus = [text for text, _ in kept_pairs]
reduced_y = [label for _, label in kept_pairs]
# Sizes after keeping only the tweets whose label is in remaining_labels.
print(len(reduced_y), len(reduced_corpus))

In [None]:
# 60/40 split of the reduced data set, with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(reduced_corpus, reduced_y, test_size=0.4, random_state=0)
# Raw term counts (terms must occur in at least 60 documents) fed straight into naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer(min_df=60)),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
# Accuracy on the held-out tweets.
np.mean(predicted == y_test)  

In [None]:
# Dict form of the report for the reduced-label model (overwrites the earlier report_dict).
report_dict = metrics.classification_report(y_test, predicted, output_dict=True)
# Human-readable form of the same report.
print(metrics.classification_report(y_test, predicted))

In [None]:
%matplotlib nbagg
importance = get_most_important_features(text_clf.get_params()['vect'], text_clf.get_params()['clf'], 10)

#print(importance[33])

#print(text_clf.get_params()['clf'].classes_)
#top_scores = [a[0] for a in importance[1]['tops']]
#top_words = [a[1] for a in importance[1]['tops']]

plot_important_words(importance, text_clf.get_params()['clf'].classes_, "Most important words for relevance")

* High importance of `anumberzzz` for almost all labels
* Some assignments are plausible (Assassin's Creed, Fallout, football, snow)
* Most of them, however, are hard to make sense of

In [None]:
# 60/40 split of the reduced data set, with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(reduced_corpus, reduced_y, test_size=0.4, random_state=0)

text_clf = Pipeline([
    ('vect', CountVectorizer(min_df=60)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Grid over n-gram range, idf weighting and the naive Bayes smoothing parameter.
parameters = {
     'vect__ngram_range': [(1, 1), (1, 2)],
     'tfidf__use_idf': (True, False),
     'clf__alpha': (0, 0.1, 1),
}
# NOTE(review): the `iid` argument is not accepted by newer scikit-learn
# releases — confirm the installed version before re-running this cell.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=3)
gs_clf.fit(X_train, y_train)
predicted = gs_clf.predict(X_test)
# Accuracy on the held-out tweets.
np.mean(predicted == y_test)  

In [None]:
# Print the best value the grid search found for every parameter in the grid.
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

Best params:
* clf__alpha: 0.1
* tfidf__use_idf: True
* vect__ngram_range: (1, 2)

In [None]:
# Dict form of the report for the grid-searched model (overwrites the earlier report_dict).
report_dict = metrics.classification_report(y_test, predicted, output_dict=True)
# Human-readable form of the same report.
print(metrics.classification_report(y_test, predicted))

In [None]:
%matplotlib nbagg
importance = get_most_important_features(gs_clf.best_estimator_.get_params()['vect'], gs_clf.best_estimator_.get_params()['clf'], 10)
plot_important_words(importance, gs_clf.best_estimator_.get_params()['clf'].classes_, "Most important words for relevance")