In [8]:
from IPython.core.display import HTML, display
HTML("<style>.container { width:100% !important; }</style>")
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = matplotlib.rcParams['savefig.dpi']

In [9]:
import pandas as pd
import numpy as np
# import emoji lists from data.sentiment_dict
from Data.sentiment_dict import positive, negative
from sklearn.feature_extraction import DictVectorizer
pd.options.display.max_colwidth=250

In [10]:
# separate tweet into parts with emojis of interest and the rest
def separate_emoji(tweet, emojis):
    emojis = set(tweet).intersection(emojis)
    text = ''
    for i in tweet:
        if i not in emojis:
            text += i
    return text, dict.fromkeys(emojis,1)

In [11]:
# specify the location of the tweets
negtweet_file ='./Data/negtweets.txt'
postweet_file ='./Data/postweets.txt'

In [12]:
# initial lists, one for tweets, one for emoji of interest
tweet_df = [] # so far, it is just lists
emoji_df = []
# load all the tweets from positive file
with open(postweet_file,'r') as f:
    for line in f:
        tweet = line.decode('utf-8')
        emojis = positive+negative
        text_i, emojis_i =separate_emoji(tweet, emojis)
        emoji_df.append(emojis_i) 
        tweet_df.append(text_i)
# load all the tweets from negative file
with open(negtweet_file,'r') as f:
    for line in f:
        tweet = line.decode('utf-8')
        emojis = positive+negative
        text_i, emojis_i =separate_emoji(tweet, emojis)
        emoji_df.append(emojis_i) 
        tweet_df.append(text_i)

# convert data into dataframe
tweet_df = pd.DataFrame(tweet_df, columns=['tweet'])
DV=DictVectorizer()
emoji_df=DV.fit_transform(emoji_df) # this is actually a vector, or matrix, not a df
emoji_list = {}
for i in DV.feature_names_:
    emoji_list[i.encode('raw_unicode_escape')] = i
emoji_list=pd.DataFrame(emoji_list, index=[0])
emoji_list = emoji_list[[i.encode('raw_unicode_escape') for i in DV.feature_names_]]
emoji_df=pd.DataFrame(emoji_df.toarray(), columns=[i.encode('raw_unicode_escape') for i in DV.feature_names_])

In [16]:
emoji_list

Unnamed: 0,\u2639,\u263a,\U0001f600,\U0001f601,\U0001f602,\U0001f603,\U0001f604,\U0001f606,\U0001f607,\U0001f60a,...,\U0001f626,\U0001f627,\U0001f628,\U0001f629,\U0001f62d,\U0001f630,\U0001f631,\U0001f638,\U0001f641,\U0001f642
0,☹,☺,😀,😁,😂,😃,😄,😆,😇,😊,...,😦,😧,😨,😩,😭,😰,😱,😸,🙁,🙂


In [14]:
display(tweet_df)

Unnamed: 0,tweet
0,bro that's so depressing.. Let's go to his house 🙄\n
1,thank you thank you for telling me about them omg i love them ❤x\n
2,"Excellent day at work, laughed all day. Blessed to do my job. #crewlife #averybritishairline #post2276gettingthere 👏🏻 \n"
3,Things are getting there \n
4,I'm so proud of you! Welcome to the team!! #BellLetsTalk #SickNotWeak \n
5,please follow me #BellLetsTalk X18\n
6,thanks girl 💕\n
7,2 weeks from today \n
8,'s snapchats cheer me up so much\n
9,I'm not sure if I'm going yet \n


In [18]:
# combine tweet_df with emoji_df
tweet_with_emoji_df = tweet_df.copy()
tweet_with_emoji_df[emoji_df.columns] = emoji_df.copy()

In [44]:
from sklearn.cross_validation import train_test_split
(train_data, test_data) = train_test_split(tweet_with_emoji_df,random_state=0)

In [19]:
# data_samples = train_data.tweet.values
# data_samples=np.hstack((train_data.tweet.values, test_data.tweet.values))
data_samples = tweet_with_emoji_df.tweet.values

In [46]:
# tweet_with_emoji_df.head()
# test_data.shape


In [21]:
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

In [22]:
# n_samples = 2000
n_features = 1000
n_topics = 10
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()



In [49]:
# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf-idf features for NMF...
done in 0.778s.


In [50]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
print("Fitting the NMF model with tf-idf features,"
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)

print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Extracting tf features for LDA...
done in 0.773s.
Fitting the NMF model with tf-idf features,n_samples=2000 and n_features=1000...
done in 9.181s.

Topics in NMF model:
Topic #0:
love laurenfollowspree omg amp ya life song bellletstalk girl hi amazing beautiful people make hate best fucking man lol dream
Topic #1:
want don bad puppy hair home food pizza rn work talk bed people meet sushi leave life grow eat ice
Topic #2:
thank god babe boo omg aw girl youuu awh sweet haha awe rt youu ok awesome following lord amp beautiful
Topic #3:
miss man boyfriend guys days friend home babe gonna come brother amp going hair soon best bae old having gone
Topic #4:
like feel don look video new watch facetimemenash looks amp subscribe people going forget think better bitch feels girl im
Topic #5:
good morning night look luck feel hope thing looks feeling don damn looking sounds girl today feels sound time way
Topic #6:
happy birthday hope great make makes bday best enjoy amazing year bless girl wish f

In [52]:
data_lda=lda.transform(tf)

In [53]:
data_lda.shape

(50000, 20)