In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import matplotlib.pyplot  as plt
import itertools
import gensim
import logging
# Emit INFO-level log records with timestamp and level, so progress messages
# from libraries that report through `logging` show up in the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import nltk
# Fetch the NLTK corpora. 'stopwords' backs stopwords.words('german') below.
# NOTE(review): 'twitter_samples' is not referenced anywhere in the visible
# notebook - confirm it is still needed.
nltk.download('twitter_samples')
nltk.download('stopwords')

In [None]:
import string
import re
 
from nltk.corpus import stopwords 
stopwords_german = stopwords.words('german')
 
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('german')
 
from nltk.tokenize import TweetTokenizer
 
# Emoticons expressing a positive mood.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Emoticons expressing a negative mood.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Every known emoticon, regardless of mood (happy + sad).
emoticons = emoticons_happy | emoticons_sad
 
def clean_tweets(tweet):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Processing order:
      1. strip stock tickers (``$GE``), a leading old-style ``RT`` marker and
         ``http(s)`` links; drop the ``#`` sign but keep the hashtag word;
      2. replace four-digit years (1000-2999) with the token ``ayearzzz`` and
         free-standing numbers with ``anumberzzz``;
      3. tokenize with ``TweetTokenizer`` (case not preserved, @handles
         stripped, ``reduce_len`` enabled);
      4. drop German stopwords, known emoticons and punctuation-only tokens;
      5. stem the remaining tokens with the German Snowball stemmer;
      6. remove bare numbers that the tokenizer split out of words.

    Args:
        tweet (str): raw tweet text.

    Returns:
        str: the cleaned tokens joined by single spaces (may be empty).
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)

    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)

    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/[^\s]*', '', tweet)

    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # replace years with 'ayearzzz'-Token.
    # The digit lookarounds keep the pattern from biting four digits out of a
    # longer number (previously "12345" became "ayearzzz5"). Years glued to
    # letters (e.g. "wm2018") are still replaced, as before.
    tweet = re.sub(r'(?<![0-9])[1-2][0-9]{3}(?![0-9])', r'ayearzzz', tweet)

    # replace numbers with 'anumberzzz'-Token, only numbers outside of words
    tweet = re.sub(r'(?<![0-9a-zA-Z])[0-9]+(?![0-9a-zA-Z])', r'anumberzzz', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        if word in stopwords_german:  # remove stopwords
            continue
        if word in emoticons:  # remove emoticons
            continue
        # remove punctuation: a token is punctuation when *every* character is.
        # (`word in string.punctuation` was a substring test, so runs such as
        # "..." or "!!" slipped through.) Empty tokens are dropped as well.
        if all(char in string.punctuation for char in word):
            continue
        tweets_clean.append(stemmer.stem(word))  # stemming word
    tweets_clean = " ".join(tweets_clean)

    # remove numbers that were pulled out of words by tokenizer
    tweets_clean = re.sub(r'(?<![0-9a-zA-Z])[0-9]+(?![0-9a-zA-Z])', r'', tweets_clean)

    return tweets_clean
 
# Smoke test: a sample tweet containing a retweet marker, @handles, an
# emoticon, hashtags and a link. The text is English, while the stopword list
# and stemmer configured above are German.
custom_tweet = "RT @Twitter @chapagain Hello There! Have a great day. :) #good #morning http://chapagain.com.np"
 
# print cleaned tweet
print (clean_tweets(custom_tweet))

End of definitions

------------------

Start data preparation



In [None]:
# Read only the tweet text and target columns, discard rows with missing
# values and renumber the index from 0.
importdf = (
    pd.read_csv(
        '../data/trainingssets/all_emoji_tweets_03_12_18_7_labels_excluded.csv',
        sep=';',
        usecols=['tweet_full_text', 'target'],
    )
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Target column as a plain list of strings (one entry per tweet).
target_column = importdf['target'].astype(str)
all_targets = target_column.tolist()

In [None]:
# Only use the first emoji of each comma-separated target string for now.
y = [targets.split(',')[0] for targets in all_targets]

In [None]:
# Tweet texts as a plain list of strings, in the same row order as the frame.
tweet_column = importdf['tweet_full_text'].astype(str)
our_tweets = tweet_column.tolist()

In [None]:
# Clean every tweet. Iterating the tweets directly avoids the indirect
# dependency on len(y) for the loop bounds.
corpus = [clean_tweets(raw_tweet) for raw_tweet in our_tweets]

# Preview only: echoing the whole corpus floods the notebook output.
corpus[:5]

In [None]:
# get corpus token lists
class CorpusSentenceTokenList:
    """Re-iterable view of a corpus that yields each tweet as a token list.

    Every call to ``iter()`` starts a fresh pass over ``corpus`` and splits
    each cleaned tweet string on whitespace, so the object can be consumed
    several times.
    """

    def __init__(self, corpus):
        # iterable of cleaned tweet strings
        self.corpus = corpus

    def __iter__(self):
        yield from (cleaned_tweet.split() for cleaned_tweet in self.corpus)

In [None]:
# Test bigram phrases
# The experiment below is disabled by wrapping it in a string literal; nothing
# in it is executed.
# NOTE(review): it refers to `CorpusTokenList`, which is not defined in the
# visible notebook (the class is named `CorpusSentenceTokenList`), so it would
# need that name fixed before being re-enabled.
'''
token_list = CorpusTokenList(corpus[0:5])
phrases = gensim.models.Phrases(token_list, min_count=1)
i = 0
for token in token_list:
    print(token)
    i += 1
    if i > 100:
        break
phrases.add_vocab([['dis', 'is','frankfurt', 'zweit', 'spiel'],[u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
   [u'machine', u'learning', 'frankfurt', 'zweit', 'spiel', u'can', u'be', u'new', u'york' , u'sometimes']] )
for phrase, score in phrases.export_phrases(token_list):
    print(phrase, score)
    i += 1
    if i > 100:
        break
phrases[['eintracht', 'frankfurt', 'zweit', 'spiel', 'folg', 'anumberzzz', 'anumberzzz', 'anumberzzz', 'spielminut', 'wtf', 'kaum', 'kovac', '...', 'sgeom']]
'''

In [None]:
# Wrap the cleaned corpus so it can be iterated (repeatedly) as token lists.
corpus_sentences = CorpusSentenceTokenList(corpus)

In [None]:
# Learn bigram phrases from the tokenized corpus.
# Per the gensim Phrases documentation, `min_count` ignores words/bigrams seen
# fewer times than this, and `threshold` is the score a bigram must exceed to
# be accepted as a phrase (higher = fewer phrases).
phrases = gensim.models.Phrases(corpus_sentences, threshold=50, min_count=20)

In [None]:
# Print the detected phrases with their scores, stopping after 1001 entries.
for shown, (phrase, score) in enumerate(phrases.export_phrases(corpus_sentences), start=1):
    print(phrase,' ',score)
    if shown > 1000:
        break

In [None]:
# Show tweets in which the phrase model merged at least one bigram, next to
# their transformed version; stop once more than 100 merged tokens were seen.
i = 0
for tweet in corpus_sentences:
    # Transform once per tweet (previously recomputed for the loop and again
    # for every print).
    phrased_tweet = phrases[tweet]
    # A merged phrase contains the '_' delimiter and is not already a token of
    # the input, so tokens that contained '_' beforehand are not miscounted.
    merged_tokens = [word for word in phrased_tweet if '_' in word and word not in tweet]
    if merged_tokens:
        i += len(merged_tokens)
        # Print each tweet once, even if it contains several phrases.
        print(tweet)
        print(phrased_tweet)
    if i > 100:
        break


In [None]:
# Print the first 101 phrase-transformed sentences as a sanity check.
for shown, sentence in enumerate(phrases[corpus_sentences], start=1):
    print(sentence)
    if shown > 100:
        break

End data preparation

-----------------------------

Start word2vec model training

In [None]:
# train the model
# Word2vec embeddings over the phrase-transformed sentences. Parameter notes
# follow the gensim Word2Vec documentation.
# NOTE(review): `size` is the pre-4.0 gensim keyword (renamed `vector_size` in
# gensim 4) - confirm the gensim version this notebook is pinned to.
model = gensim.models.Word2Vec(
    phrases[corpus_sentences],
    size=600,  # dimensionality of the word vectors
    window=5,  # max distance between current and predicted word
    min_count=5,  # ignore tokens with a lower total frequency
    workers=2,  # number of training threads
    sg=1,  # 1 = skip-gram (0 would be CBOW)
    hs=1,  # 1 = hierarchical softmax
    negative=10,  # > 0 enables negative sampling with this many noise words
    cbow_mean=0  # only applies when CBOW is used; presumably inert with sg=1 - confirm
)

In [None]:
# store model
# Only the word vectors (model.wv) are written, in binary word2vec format.
# The target file carries a ".model" extension.
model.wv.save_word2vec_format('../data/word2vec/tweets_until_29_11_18_model3.model', binary=True)

tweets_until_29_11_18_model1.model:
 * size=300,
 * window=5,
 * min_count=20,
 * workers=2,
 * sg=1,
 * hs=1,
 * negative=10,
 * cbow_mean=0
 * phrases incorrect
 
tweets_until_29_11_18_model2.model:
 * size=500,
 * window=5,
 * min_count=1,
 * workers=2,
 * sg=1,
 * hs=1,
 * negative=10,
 * cbow_mean=0
 * phrases incorrect
 
tweets_until_29_11_18_model3.model:
 * size=600,
 * window=5,
 * min_count=5,
 * workers=2,
 * sg=1,
 * hs=1,
 * negative=10,
 * cbow_mean=0
 * phrases correct
 