In [40]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Flatten,Embedding,Activation,Dropout
from tensorflow.keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [2]:
import numpy as np
from numpy import array
import pandas as pd

In [3]:
df=pd.read_csv('data/sentiment140.csv',encoding='latin1',header=None)

In [4]:
df=df[[5,0]]
df.columns=['twits','sentiment']
df.head()

Unnamed: 0,twits,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [5]:
df0=df[df['sentiment']==0].sample(2000)
df4=df[df['sentiment']==4].sample(2000)
df=df0.append(df4).reset_index()
df=df.drop(['index'],axis=1)
df

Unnamed: 0,twits,sentiment
0,is working. Lee is making me. Boooo. Big ...,0
1,is trying to stay positive when surrounded by ...,0
2,early in the morning and my sisters are still ...,0
3,Auch my neck muscles..,0
4,Dell xps1330 is the worst notebook i have ever...,0
...,...,...
3995,@fortyoneacres Im good. You know me. Coffee an...,4
3996,@joeymcintyre @donniewahlberg I was just talki...,4
3997,my god the flipping tree surgeons are so loud!...,4
3998,@ the moooooo-vies,4


### Preprocessing and Cleaning

In [6]:
contractions={
"aren't":"are not","can't":'cannot',"couldn't":"could not","didn't":"did not","doesn't":"does not","don't":"do not",
"hadn't":"had not","hasn't":"has not","haven't":"have not","he'd ":"he would","he'll":"he will","he's":"he is",
"i'd":"i would","i'll":"i will","i'm":"i am","i've":"i have","isn't":"is not","it's":"it is","it'd":"it would",
"let's":"let us","mightn't":"might not","mustn't":"must not","shan't":"shall not","she'd":"she would","she'll":"she will","she's":"she is","shouldn't":"should not",
"that's":"that is","there's":"there is","the'd":"they would","they'll":"they will","they're":"they are","they've":"they have",
"we'd":"we would","we're":"we are","we've":"we have","weren't":"were not","what'll":"what will","what're":"what are",
"what's":"what is","what've":"what have","where's":"where is","who'd":"who would","who'll":"who will","who're":"who are",
"who's":"who is","who've":"who have","won't":"will not","wouldn't":"would not","you'd":"you would","you'll":"you will",
"you're":"you are","you've":"you have"," u ":" you "," ur ":" your "," n ":" and "
}

In [7]:
%%time
import re

text=' '.join(df['twits'])
text=text.split()
freq_com=pd.Series(text).value_counts()
rare=freq_com[freq_com.values==1]

def get_clean_text(x):
    if type(x) is str:
        x=x.lower()
        for key in contractions:
            value=contractions[key]
            x=x.replace(key,value)
        x=re.sub(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+_-])','',x)
        x=re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=&%:/~+#-]*[\w@?^=&%:/~+#-])?','',x)
        x=re.sub('RT','',x)
        x=re.sub('[^A-Z a-z 0-9-]+','',x)
        x=' '.join([t for t in x.split() if t not in rare])
        return x
    else:
        return x
df['twits']=df['twits'].apply(lambda x: get_clean_text(x))

Wall time: 278 ms


In [8]:
df

Unnamed: 0,twits,sentiment
0,is working lee is making me big mean is being ...,0
1,is trying to stay when by negative i wish ever...,0
2,early in the morning and my are still not slee...,0
3,auch my neck,0
4,dell is the worst i have ever owned 2 times no...,0
...,...,...
3995,fortyoneacres im good you know me coffee and b...,4
3996,joeymcintyre donniewahlberg i was just talking...,4
3997,my god the are so loud but quite lol,4
3998,the,4


In [9]:
text=df['twits'].tolist()
text

['is working lee is making me big mean is being by cute sweet boy send blue things to him',
 'is trying to stay when by negative i wish everyone i loved was single so they could be only mine',
 'early in the morning and my are still not sleeping poor them',
 'auch my neck',
 'dell is the worst i have ever owned 2 times now no power no boot its a piece of s pofff',
 'chatcat86 poor baby',
 'hates those stupid sleep who sat next to me at 8a service i am off to get some crack in a cup before service',
 'ehssanv you r getting sick and i am the one in i should not say much it could be matter of time b4 i get it',
 'i wish i ate as well as mjyazzie i ate from a can tonight',
 'still wont work evil stuffs',
 'sighthinking about my makes me sad and they call them crushes cuz they crushes your heart',
 'djbriancua ive been so busy this past week i havent to updating music i bet the mb of lady is awesome',
 'jasedit the part was trying to internl data put it into excel and then smash it into sps

In [61]:
y=df['sentiment']

In [62]:
y

0       0
1       0
2       0
3       0
4       0
       ..
3995    4
3996    4
3997    4
3998    4
3999    4
Name: sentiment, Length: 4000, dtype: int64

In [12]:
token=Tokenizer()
token.fit_on_texts(text)

In [13]:
vocab_size=len(token.word_index)+1
vocab_size

6835

In [14]:
print(token.index_word)



In [15]:
encoded_text=token.texts_to_sequences(text)

In [16]:
print(encoded_text)

[[6, 166, 2368, 6, 232, 18, 178, 362, 6, 167, 116, 395, 396, 498, 363, 923, 228, 2, 122], [6, 193, 2, 298, 93, 116, 1685, 1, 85, 168, 1, 326, 27, 807, 19, 84, 112, 23, 136, 261], [251, 10, 3, 90, 7, 9, 22, 61, 11, 499, 275, 131], [2369, 9, 1295], [2370, 6, 3, 585, 1, 13, 238, 2371, 75, 299, 34, 37, 636, 37, 2372, 66, 5, 1064, 14, 327, 2373], [2374, 275, 262], [808, 300, 414, 111, 123, 809, 150, 2, 18, 24, 1686, 810, 1, 15, 67, 2, 39, 58, 1687, 10, 5, 547, 187, 810], [2375, 4, 282, 145, 159, 7, 1, 15, 3, 56, 10, 1, 113, 11, 183, 98, 8, 112, 23, 1688, 14, 49, 1065, 1, 39, 8], [1, 85, 1, 442, 101, 63, 101, 2376, 1, 442, 50, 5, 64, 114], [61, 500, 46, 1296, 2377], [2378, 65, 9, 283, 18, 106, 7, 84, 397, 131, 1689, 398, 84, 1689, 31, 443], [2379, 501, 83, 19, 301, 30, 586, 141, 1, 1297, 2, 1690, 328, 1, 1298, 3, 2380, 14, 924, 6, 135], [2381, 3, 378, 27, 193, 2, 2382, 2383, 213, 8, 276, 2384, 7, 91, 1299, 8, 276, 2385, 3, 415, 6, 20, 1691], [2386, 55, 10, 2, 2387, 28, 13, 2, 637, 131], [238

In [17]:
max_length=120
x=pad_sequences(encoded_text,maxlen=max_length,padding='post')

In [18]:
print(x)

[[   6  166 2368 ...    0    0    0]
 [   6  193    2 ...    0    0    0]
 [ 251   10    3 ...    0    0    0]
 ...
 [   9  264    3 ...    0    0    0]
 [   3    0    0 ...    0    0    0]
 [6834  127    4 ...    0    0    0]]


In [19]:
x.shape

(4000, 120)

### Glove Vectors

In [20]:
glove_vectors=dict()

In [21]:
%%time
file=open('data/glove.twitter.27B.25d.txt',encoding='utf-8')
for line in file:
    values=line.split()
    word=values[0]
    vectors=np.asarray(values[1:])
    glove_vectors[word]=vectors
file.close

Wall time: 10.6 s


<function TextIOWrapper.close()>

In [22]:
len(glove_vectors.keys())

1193514

In [23]:
glove_vectors.get('you')

array(['-0.41586', '0.32548', '-0.087621', '0.2018', '-0.80017',
       '-0.34418', '2.1431', '0.37188', '-0.9409', '0.24283', '-0.86396',
       '0.63858', '-6.0171', '-0.54081', '-0.43305', '0.095707',
       '0.37971', '-1.1432', '0.11382', '-0.38361', '0.41758', '0.081476',
       '-0.02659', '0.75438', '-0.77178'], dtype='<U9')

In [24]:
# 없는 단어는 error 뜬다
glove_vectors.get('you').shape

(25,)

In [29]:
word_vector_matrix=np.zeros((vocab_size,25))

In [30]:
for word,index in token.word_index.items():
    vector=glove_vectors.get(word)
    if vector is not None:
        word_vector_matrix[index]=vector
    else:
        print(word)

2
4
3
1
5
lt3
10
9
30
12
8
7
quoti
tommcfly
6
100
2nite
donniewahlberg
13
davidarchie
2day
11
2morrow
14
1st
15
b4
24
trentreznor
toooo
600
gfalcone601
jonathanrknight
6am
thedebbyryan
4th
40
230
2nd
35
730
jonasaustralia
20
3rd
3g
3gs
18th
2009
mitchelmusso
33
200
quotthe
8a
vitners
freerealms
youuuu
karliehustle
ak618
447
jackalltimelow
48
jordanknight
imanwilliams
99
1700
360
09
pauldale67
therealjordin
grrrrr
ps2
priscx
retrorewind
glbriggs
wethetravis
youuuuu
debbiefletcher
5th
eclipseapp
angelajames
esmeeworld
amoyal
andyclemmensen
140
7th
nahrain
peterfacinelli
yeeee
brianmcnugget
dexteraddict
rorothecutest
f1
mishacollins
ryankfm
frankiethesats
stephenfry
ethansuplee
kyleturman
tempoary
mandyyjirouxx
helmuts
dhughesy
lt3333
vobes
hippychikky
1k
greggarbo
quotboy
andydick
90
comixinc
pofff
chatcat86
ehssanv
mjyazzie
sighthinking
djbriancua
jasedit
internl
delphiz
myinnersexfiend
sethsimonds
vacateny
itsjustdi
foofertheimpala
elabeth
robdenbleyker
minxxym
bonnielady
blaheveryone


In [34]:
word_vector_matrix

array([[ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.26079 ,  0.59108 ,  0.61622 , ...,  0.076869,  0.2284  ,
         0.2758  ],
       [ 0.28228 ,  0.019558,  0.11509 , ...,  0.2143  ,  0.25422 ,
        -0.26674 ],
       ...,
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [-0.27822 ,  0.02691 ,  1.1231  , ...,  1.2278  , -0.72479 ,
         0.53962 ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]])

In [35]:
word_vector_matrix.shape

(6835, 25)

### TF and Keras model building

In [31]:
x

array([[   6,  166, 2368, ...,    0,    0,    0],
       [   6,  193,    2, ...,    0,    0,    0],
       [ 251,   10,    3, ...,    0,    0,    0],
       ...,
       [   9,  264,    3, ...,    0,    0,    0],
       [   3,    0,    0, ...,    0,    0,    0],
       [6834,  127,    4, ...,    0,    0,    0]])

In [36]:
x.shape

(4000, 120)

In [72]:
y

0       0
1       0
2       0
3       0
4       0
       ..
3995    4
3996    4
3997    4
3998    4
3999    4
Name: sentiment, Length: 4000, dtype: int64

In [64]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [65]:
x_train.shape, x_test.shape, y_train.shape

((3200, 120), (800, 120), (3200,))

In [66]:
vec_size=25

model=Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=vec_size, input_length=max_length,weights=[word_vector_matrix],trainable=False))

model.add(Conv1D(64,8,activation='relu'))
model.add(MaxPooling1D(2))
model.add(Dropout(0.2))

model.add(Dense(32,activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(16,activation='relu'))

model.add(GlobalMaxPooling1D())
model.add(Dense(1,activation='sigmoid'))

In [67]:
model.compile(optimizer=Adam(lr=0.0001),loss='binary_crossentropy',metrics=['accuracy'])

In [68]:
hist=model.fit(x_train,y_train,epochs=5,validation_data=(x_test,y_test))

ValueError: Failed to find data adapter that can handle input: <class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>

In [69]:
def get_encoded(x):
    x=get_clean_text(x)
    x=token.texts_to_sequences(x)
    x=pad_sequences(x,maxlen=max_length,padding='post')
    return x

In [71]:
get_encoded(['hi i am lion'])

array([[345,   1,  15,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0]])