In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import PCA

from code.cleaner import preprocess

In [2]:
sentiments = pd.read_csv('./data/judge-1377884607_tweet_product_company.csv',encoding='ISO-8859-1')
sentiments.tail()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
9088,Ipad everywhere. #SXSW {link},iPad,Positive emotion
9089,"Wave, buzz... RT @mention We interrupt your re...",,No emotion toward brand or product
9090,"Google's Zeiger, a physician never reported po...",,No emotion toward brand or product
9091,Some Verizon iPhone customers complained their...,,No emotion toward brand or product
9092,Ï¡Ïàü_ÊÎÒ£Áââ_£â_ÛâRT @...,,No emotion toward brand or product


In [4]:
sentiments['is_there_an_emotion_directed_at_a_brand_or_product'].value_counts()

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [111]:
sentiments['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [5]:
sentiments.dropna(subset=['tweet_text'],inplace=True)

In [30]:
sw = stopwords.words('english')
specific_words = ['@mention','link','sxsw','#sxsw','@sxsw']
sw.extend(specific_words)

In [66]:
preprocess(sentiments['tweet_text'],sw)

['@wesley g iphone hr tweet #rise austin dead need upgrade plugin station',
 '@jessedee know @fludapp awesome ipad iphone app likely appreciate design also give free t',
 '@swonderlin wait #ipad also sale',
 'hope year festival crashy year iphone app',
 '@sxtxstate great stuff fri marissa mayer google tim reilly tech book conference amp matt mullenweg wordpress',
 '@teachntech new ipad apps #speechtherapy communication showcased conference http ht ly n #iear #edchat #asd',
 'nan',
 'start #ctia around corner #googleio hop skip jump good time #android fan',
 'beautifully smart simple idea rt @madebymany @thenextweb write #hollergram ipad app http bit ly ieavob',
 'counting day plus strong canadian dollar mean stock apple gear',
 'excited meet @samsungmobileus show sprint galaxy still run android #fail',
 'find amp start impromptu party @hurricaneparty http bit ly gvlrin wait til android app come',
 'foursquare ups game time http j mp grn pk still prefer @gowalla far best look android ap

In [84]:
emotion = 'is_there_an_emotion_directed_at_a_brand_or_product'

In [85]:
y_n = sentiments\
[
                (sentiments[emotion] == 'Positive emotion') |
                (sentiments[emotion] == 'Negative emotion')
                ]

In [86]:
sentiments[sentiments[emotion] == 'Positive emotion']

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
...,...,...,...
9072,@mention your iPhone 4 cases are Rad and Ready...,iPhone,Positive emotion
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [87]:
y_n[emotion].value_counts()

Positive emotion    2978
Negative emotion     570
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

In [88]:
y_n

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
...,...,...,...
9077,@mention your PR guy just convinced me to swit...,iPhone,Positive emotion
9079,&quot;papyrus...sort of like the ipad&quot; - ...,iPad,Positive emotion
9080,Diller says Google TV &quot;might be run over ...,Other Google product or service,Negative emotion
9085,I've always used Camera+ for my iPhone b/c it ...,iPad or iPhone App,Positive emotion


In [89]:
y_n_tokens = preprocess(y_n['tweet_text'],sw=sw)

In [90]:
tfidf = TfidfVectorizer(ngram_range=[1,4],max_df=0.5,min_df=10)
X = tfidf.fit_transform(y_n_tokens)

In [91]:
pd.DataFrame(zip(preprocess(sentiments['tweet_text'],sw=sw),sentiments['tweet_text']))

Unnamed: 0,0,1
0,@wesley g iphone hr tweet #rise austin dead ne...,.@wesley83 I have a 3G iPhone. After 3 hrs twe...
1,@jessedee know @fludapp awesome ipad iphone ap...,@jessedee Know about @fludapp ? Awesome iPad/i...
2,@swonderlin wait #ipad also sale,@swonderlin Can not wait for #iPad 2 also. The...
3,hope year festival crashy year iphone app,@sxsw I hope this year's festival isn't as cra...
4,@sxtxstate great stuff fri marissa mayer googl...,@sxtxstate great stuff on Fri #SXSW: Marissa M...
...,...,...
9088,ipad everywhere,Ipad everywhere. #SXSW {link}
9089,wave buzz rt interrupt regularly schedule geek...,"Wave, buzz... RT @mention We interrupt your re..."
9090,google zeiger physician never report potential...,"Google's Zeiger, a physician never reported po..."
9091,verizon iphone customer complain time fell bac...,Some Verizon iPhone customers complained their...


In [92]:
y_n['tweet_text']

0       .@wesley83 I have a 3G iPhone. After 3 hrs twe...
1       @jessedee Know about @fludapp ? Awesome iPad/i...
2       @swonderlin Can not wait for #iPad 2 also. The...
3       @sxsw I hope this year's festival isn't as cra...
4       @sxtxstate great stuff on Fri #SXSW: Marissa M...
                              ...                        
9077    @mention your PR guy just convinced me to swit...
9079    &quot;papyrus...sort of like the ipad&quot; - ...
9080    Diller says Google TV &quot;might be run over ...
9085    I've always used Camera+ for my iPhone b/c it ...
9088                        Ipad everywhere. #SXSW {link}
Name: tweet_text, Length: 3548, dtype: object

In [93]:
vect = pd.DataFrame(X.toarray(),columns=tfidf.get_feature_names())
vect.head()

Unnamed: 0,aclu,action,actually,ad,agree,almost,already,also,always,amaze,...,yay,yeah,year,year time,yes,yes gowalla,yes gowalla win,yes gowalla win best,yet,zazzlesxsw
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362991,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.451424,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.675077,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [94]:
tfidf.get_feature_names()

['aclu',
 'action',
 'actually',
 'ad',
 'agree',
 'almost',
 'already',
 'also',
 'always',
 'amaze',
 'amazing',
 'america',
 'among',
 'amp',
 'amp google',
 'amp ipad',
 'amp ipad launch',
 'amp physical',
 'amp physical world',
 'amp physical world mobile',
 'analytics',
 'andoid',
 'andoid app',
 'andoid app team',
 'andoid app team android',
 'android',
 'android app',
 'android choice',
 'android choice award',
 'android choice award thanks',
 'android market',
 'android party',
 'android phone',
 'announce',
 'another',
 'anyone',
 'anything',
 'app',
 'app ipad',
 'app iphone',
 'app song',
 'app song info',
 'app store',
 'app store include',
 'app store include uberguide',
 'app team',
 'app team android',
 'app team android choice',
 'apple',
 'apple come',
 'apple come cool',
 'apple come cool technology',
 'apple ipad',
 'apple open',
 'apple open pop',
 'apple open pop store',
 'apple open temp',
 'apple open temp store',
 'apple open temporary',
 'apple open temporary 

In [29]:
PCA?

[0;31mInit signature:[0m
[0mPCA[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_components[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcopy[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mwhiten[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msvd_solver[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0miterated_power[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Principal component analysis (PCA).

Linear dimensionality reduction using Singular Value Decomposition of the
data to project it to a lower dimensional space. The input data is centered
but not scaled for each feature before a

In [95]:
pca = PCA(n_components=0.85)

In [96]:
pca.fit_transform(vect,y_n[emotion])

array([[-3.03587949e-02, -1.41164375e-01, -7.32801596e-02, ...,
         2.75706673e-02,  2.09193650e-02, -1.89788612e-02],
       [-9.59875801e-02, -2.58987078e-01, -1.19538572e-01, ...,
         1.99038537e-03, -3.10681448e-02,  4.63538036e-02],
       [ 3.14084735e-02, -2.78991105e-02, -2.11846918e-02, ...,
         5.86987704e-04, -6.80857924e-03, -1.07409148e-03],
       ...,
       [-1.15404221e-01,  7.84186552e-02,  2.62122528e-01, ...,
         2.42488536e-04,  1.38080159e-02, -2.32319877e-02],
       [-7.64246425e-02, -1.87494813e-01, -7.73961889e-02, ...,
         4.73066318e-03,  1.10984841e-02, -6.04733503e-03],
       [ 4.49565328e-02, -3.37838081e-02, -1.76535102e-02, ...,
         6.22628327e-02,  3.55090384e-02,  1.25749706e-01]])

In [97]:
pca.explained_variance_ratio_.cumsum()

array([0.02109136, 0.03853507, 0.05458821, 0.06797102, 0.08010989,
       0.09047957, 0.10020883, 0.10903326, 0.11773946, 0.12616094,
       0.1343564 , 0.14208212, 0.14936408, 0.15617795, 0.16247036,
       0.16875362, 0.17462499, 0.18046913, 0.18604735, 0.19157276,
       0.19702158, 0.20244513, 0.20773707, 0.21290783, 0.21801306,
       0.22303469, 0.2279858 , 0.23285194, 0.23763704, 0.24235338,
       0.24702481, 0.25158328, 0.25610059, 0.26053478, 0.26492703,
       0.26929317, 0.2735443 , 0.27775178, 0.28189607, 0.28598592,
       0.29003735, 0.29404056, 0.29800016, 0.30187367, 0.30573549,
       0.30953707, 0.31331564, 0.31705673, 0.3207294 , 0.32431405,
       0.32786423, 0.3313835 , 0.3348819 , 0.33832868, 0.34175282,
       0.34514963, 0.34852591, 0.35186419, 0.35517712, 0.35846363,
       0.3617072 , 0.36492926, 0.36809786, 0.3712135 , 0.3743104 ,
       0.37739293, 0.38044524, 0.38345427, 0.38644983, 0.38942907,
       0.39238716, 0.395345  , 0.39827142, 0.40118661, 0.40406

In [98]:
len(pca.components_)

372

In [69]:
display(sentiments.head(10), y_n.head())

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion
5,@teachntech00 New iPad Apps For #SpeechTherapy...,,No emotion toward brand or product
6,,,No emotion toward brand or product
7,"#SXSW is just starting, #CTIA is around the co...",Android,Positive emotion
8,Beautifully smart and simple idea RT @madebyma...,iPad or iPhone App,Positive emotion
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion


Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [70]:
sentiments.sample(5)['tweet_text']

5261    RT @mention #SXSW @mention Interactive session...
1808    Just won a @mention #sxsw mystery prize! I hop...
8347    tried installing @mention on my iphone but it ...
2539    is a bit disappointed that the two iPad talks ...
852     For those looking for HIG-like guidelines when...
Name: tweet_text, dtype: object

In [99]:
sentiments.iloc[5014]['tweet_text']

'save me some cash! TechCrunch Giveaway: An iPad 2åÊ#TechCrunch {link} via @mention #winning! #ipad2 #sxsw #apple'

In [71]:
for col in ['emotion_in_tweet_is_directed_at','is_there_an_emotion_directed_at_a_brand_or_product']:
    display(sentiments[col].value_counts())
sentiments.info()

iPad                               946
Apple                              661
iPad or iPhone App                 470
Google                             430
iPhone                             297
Other Google product or service    293
Android App                         81
Android                             78
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64

No emotion toward brand or product    5389
Positive emotion                      2978
Negative emotion                       570
I can't tell                           156
Name: is_there_an_emotion_directed_at_a_brand_or_product, dtype: int64

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          9092 non-null   object
 1   emotion_in_tweet_is_directed_at                     3291 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  9093 non-null   object
dtypes: object(3)
memory usage: 213.2+ KB
