In [1]:
from IPython.core.display import HTML, display
HTML("<style>.container { width:100% !important; }</style>")
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']



In [2]:
import pandas as pd
import numpy as np
from Data.sentiment_dict import positive, negative
from sklearn.feature_extraction import DictVectorizer
pd.options.display.max_colwidth=250

In [3]:
# separate tweet into parts with emojis of interest and the rest
def separate_emoji(tweet, emojis):
    emojis = set(tweet).intersection(emojis)
    text = ''
    for i in tweet:
        if i not in emojis:
            text += i
    return text, dict.fromkeys(emojis,1)

In [4]:
# specify the location of the tweets
negtweet_file ='./Data/negtweets.txt'
postweet_file ='./Data/postweets.txt'

In [5]:
# initial dataframes, one for tweets, one for emoji of interest
tweet_df = []
emoji_df = []
# load all the tweets from positive file
with open(postweet_file,'r') as f:
    for line in f:
        tweet = line.decode('utf-8')
        emojis = positive+negative
        text_i, emojis_i =separate_emoji(tweet, emojis)
        emoji_df.append(emojis_i) 
        tweet_df.append(text_i)
# load all the tweets from negative file
with open(negtweet_file,'r') as f:
    for line in f:
        tweet = line.decode('utf-8')
        emojis = positive+negative
        text_i, emojis_i =separate_emoji(tweet, emojis)
        emoji_df.append(emojis_i) 
        tweet_df.append(text_i)

# convert data into dataframe
tweet_df = pd.DataFrame(tweet_df, columns=['tweet'])
DV=DictVectorizer()
emoji_df=DV.fit_transform(emoji_df)
emoji_list = {}
for i in DV.feature_names_:
    emoji_list[i.encode('raw_unicode_escape')] = i
emoji_list=pd.DataFrame(emoji_list, index=[0])
emoji_list = emoji_list[[i.encode('raw_unicode_escape') for i in DV.feature_names_]]
emoji_df=pd.DataFrame(emoji_df.toarray(), columns=[i.encode('raw_unicode_escape') for i in DV.feature_names_])

In [6]:
# combine tweet_df with emoji_df
tweet_with_emoji_df = tweet_df.copy()
tweet_with_emoji_df[emoji_df.columns] = emoji_df.copy()

In [7]:
# tweet_with_emoji_df.head()

In [8]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [9]:
n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

data_samples = tweet_df.tweet.values

In [10]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.833s.


In [11]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Extracting tf features for LDA...
done in 0.812s.
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
done in 2.945s.

Topics in NMF model:
Topic #0:
love follow laurenfollowspree baby bellletstalk omg hi facetimemenash make amp hey lauren girl life ya dream song video beautiful nash
Topic #1:
miss baby man boyfriend really guys days friend home babe gonna come brother amp going soon hair best bae old
Topic #2:
thank god babe omg boo aw girl baby sweet youuu awh haha awe rt follow ok youu awesome oh beautiful
Topic #3:
good morning day night today sleep hope look luck feel thing really feeling looks damn great time bad looking girl
Topic #4:
thanks follow girl boo following rt lol lot man oh babe haha hey hi ily great sharing laurenfollowspree check connect
Topic #5:
need sleep really new right stop help phone hair nails life friends come massage nap amp asap house rn hug
Topic #6:
just want really wanna sleep got bad home did today amp baby work day man li