In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [24]:
df=pd.read_csv('sentiment140_cleaned.csv')
df.head()

Unnamed: 0,target,id,date,flag,user,text,cleaned_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccom2y1zl awww thats bumm...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset cant update facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dived many times ball managed save 50...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving im mad cant see


In [25]:
df=df.drop(['id','date','flag','user','cleaned_text'],axis=1)

In [26]:
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [28]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [30]:
import re

stop_word=set(stopwords.words('english'))
def clean_txt(text):
    text=re.sub(r'http\S+',' ',text)
    text=re.sub(r'<.*?>',' ',text)
    text=re.sub(r'[^a-z\s]',' ',text)
    text=text.lower().split()
    text=[lemmatizer.lemmatize(word) for word in text if word not in stop_word]
    return ' '.join(text)


df['clean_text']=df['text'].apply(clean_txt)
corpus=df['clean_text'].tolist()

In [31]:
df.head()

Unnamed: 0,target,text,clean_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot www bummer ou shoulda got avid arr ...
1,0,is upset that he can't update his Facebook by ...,upset update acebook texting might cry result ...
2,0,@Kenichan I dived many times for the ball. Man...,enichan dived many time ball anaged save rest ...
3,0,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclass behaving mad see


In [32]:
df.drop('text',axis=1)

Unnamed: 0,target,clean_text
0,0,switchfoot www bummer ou shoulda got avid arr ...
1,0,upset update acebook texting might cry result ...
2,0,enichan dived many time ball anaged save rest ...
3,0,whole body feel itchy like fire
4,0,nationwideclass behaving mad see
...,...,...
1599995,1,ust woke aving school best feeling ever
1599996,1,com ery cool hear old alt interview
1599997,1,ready akeover sk detail
1599998,1,appy th irthday boo alll time upac maru hakur


In [33]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer=Tokenizer()
tokenizer.fit_on_texts(corpus)

In [34]:
tokenized_text=tokenizer.texts_to_sequences(corpus)

In [35]:
max_len=max(len(seq) for seq in tokenized_text)

In [36]:
max_len

50

In [37]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x=pad_sequences(tokenized_text,maxlen=max_len,padding='post')

In [38]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,df['target'],test_size=0.30,random_state=42)

In [39]:
max_vocab=len(tokenizer.word_index)+1
max_vocab

493970

In [40]:
max_len1=x_train.shape[1]
max_len1

50

In [41]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding,Bidirectional

model=Sequential()
model.add(Embedding(max_vocab,output_dim=128,input_length=max_len1))
model.add(Bidirectional(LSTM(150,return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

In [43]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 128)           63228160  
                                                                 
 bidirectional (Bidirectiona  (None, 300)              334800    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (None, 300)               0         
                                                                 
 dense_1 (Dense)             (None, 1)                 301       
                                                                 
Total params: 63,563,261
Trainable params: 63,563,261
Non-trainable params: 0
_________________________________________________________________


In [44]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop=EarlyStopping(monitor='val_loss',patience=2,restore_best_weights=True)
model.compile(optimizer='Adam',loss='binary_crossentropy',metrics=['accuracy'])
history=model.fit(x_train,y_train,batch_size=64,epochs=50,shuffle=True,validation_split=0.1,callbacks=[early_stop],verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50


In [45]:
model.save('sentiment_model.h5')

In [48]:
import joblib

joblib.dump(tokenizer,'tokenizer.pkl')



['tokenizer.pkl']