In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import PCA

from code.cleaner import preprocess

In [2]:
sentiments = pd.read_csv('./data/judge-1377884607_tweet_product_company.csv',encoding='ISO-8859-1')
sentiments.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [3]:
# sentiments['product'] = sentiments['emotion_in_tweet_is_directed_at']
# sentiments['sentiment'] = sentiments['is_there_an_emotion_directed_at_a_brand_or_product']

sentiments.rename(columns={
    'emotion_in_tweet_is_directed_at':'product',
    'is_there_an_emotion_directed_at_a_brand_or_product':'sentiment'
},inplace=True)

In [4]:
sentiments.dropna(subset=['tweet_text'],inplace=True)

In [5]:
sw = stopwords.words('english')
specific_words = ['@mention','link','sxsw','#sxsw','@sxsw']
# ^^ these are the original extended stop words from initial data
# discovery and knowledge 
discovered_words = [
    'google','iphone', 'ipad', 'android', 'app',
    'apple', 'rt', 'quot', 'store', 'new', 'austin'
    ]
specific_words.extend(discovered_words)
sw.extend(specific_words)

In [6]:
preprocess(sentiments['tweet_text'],sw)

['@wesley g hr tweet #rise dead need upgrade plugin station',
 '@jessedee know @fludapp awesome likely appreciate design also give free t',
 '@swonderlin wait #ipad also sale',
 'hope year festival crashy year',
 '@sxtxstate great stuff fri marissa mayer tim reilly tech book conference amp matt mullenweg wordpress',
 '@teachntech apps #speechtherapy communication showcased conference http ht ly n #iear #edchat #asd',
 'start #ctia around corner #googleio hop skip jump good time #android fan',
 'beautifully smart simple idea @madebymany @thenextweb write #hollergram http bit ly ieavob',
 'counting day plus strong canadian dollar mean stock gear',
 'excited meet @samsungmobileus show sprint galaxy still run #fail',
 'find amp start impromptu party @hurricaneparty http bit ly gvlrin wait til come',
 'foursquare ups game time http j mp grn pk still prefer @gowalla far best look date',
 'gotta love calendar feature top party show case check @hamsandwich via @ischafer gt http bit ly axzwxb',

In [7]:
pos_neg = sentiments[
                (sentiments['sentiment'] == 'Positive emotion') |
                (sentiments['sentiment'] == 'Negative emotion')
                ]

In [8]:
pos_neg_tokens = preprocess(pos_neg['tweet_text'],sw=sw)
sentiment_tokens = preprocess(sentiments['tweet_text'],sw=sw)

In [9]:
tfidf = TfidfVectorizer(ngram_range=[1,3],max_df=0.4,min_df=20)
X = tfidf.fit_transform(pos_neg_tokens)

In [10]:
vect = pd.DataFrame(X.toarray(),columns=tfidf.get_feature_names())
vect.head()

Unnamed: 0,action,already,also,amaze,america,amp,amp launch,android,another,anyone,...,white,win,wish,work,world,would,wow,year,yes,yet
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.453554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.65922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.848161,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.24797,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
pca = PCA(n_components=0.85,random_state=13)

In [12]:
pca.fit_transform(vect,pos_neg['sentiment'])

array([[-0.00399322, -0.07584209, -0.0390088 , ...,  0.00735729,
         0.01100718,  0.00445185],
       [-0.03363339, -0.10235431,  0.00973906, ..., -0.03188175,
         0.01191654,  0.00434093],
       [-0.05826156,  0.15126306, -0.04998455, ..., -0.01665823,
         0.02351447,  0.03175628],
       ...,
       [ 0.05716259, -0.03660354,  0.02700429, ...,  0.01210775,
         0.0023904 ,  0.02284946],
       [-0.0151531 , -0.10477841, -0.05461386, ..., -0.04493121,
        -0.09206203, -0.07977158],
       [-0.00070624, -0.02379904, -0.02063878, ...,  0.00312998,
         0.00062806,  0.00219268]])

In [13]:
len(pca.explained_variance_),pca.explained_variance_ratio_#.cumsum()

(191,
 array([0.01897255, 0.01565831, 0.01510667, 0.01230149, 0.01192817,
        0.0116874 , 0.01151951, 0.01076177, 0.01030256, 0.00954774,
        0.00945606, 0.009269  , 0.00921173, 0.00885157, 0.00861984,
        0.00846214, 0.00827506, 0.00821535, 0.00805788, 0.00778056,
        0.00772971, 0.00752507, 0.00745596, 0.00730924, 0.00724863,
        0.00715212, 0.00701799, 0.00699083, 0.00694318, 0.00680086,
        0.0067434 , 0.00659302, 0.0065327 , 0.00643894, 0.00625987,
        0.00620216, 0.006111  , 0.00594758, 0.00587864, 0.00582428,
        0.00575209, 0.0057252 , 0.00565978, 0.00558185, 0.00550131,
        0.00545032, 0.0053867 , 0.00529839, 0.00525142, 0.00514672,
        0.00509541, 0.00504526, 0.00503079, 0.00489962, 0.00486934,
        0.00484179, 0.00480355, 0.00477283, 0.00467879, 0.0046548 ,
        0.00461583, 0.00460476, 0.00455334, 0.00449851, 0.00447029,
        0.00443872, 0.00441139, 0.00439134, 0.00435361, 0.00426227,
        0.00424301, 0.00420962, 0.00417441