In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re
import pickle
import streamlit as st

In [3]:
def load_data(filepath):
    columns = ['target', 'id', 'date', 'flag', 'user', 'text']
    data = pd.read_csv(filepath, encoding = "latin - 1", names=columns)
    return data

In [4]:
df = load_data("training.1600000.processed.noemoticon.csv")

In [5]:
df

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   id      1600000 non-null  int64 
 2   date    1600000 non-null  object
 3   flag    1600000 non-null  object
 4   user    1600000 non-null  object
 5   text    1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [7]:
df['text'][5]

'@Kwesidei not the whole crew '

In [8]:
df['text'][7]

"@LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?"

In [9]:
df['tweet'] = df['text'].str.lower()

In [10]:
df['text'][7]

"@LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ?"

In [11]:
df['tweet'] = df['tweet'].apply(lambda x:re.sub(r'[^a-zA-Z\s]',"",x))

In [12]:
df['tweet'][7]

'loltrish hey  long time no see yes rains a bit only a bit  lol  im fine thanks  hows you '

In [13]:
df["tweet_tokens"] = df['tweet'].apply(lambda x:x.split())

In [14]:
df["tweet_tokens"][7]

['loltrish',
 'hey',
 'long',
 'time',
 'no',
 'see',
 'yes',
 'rains',
 'a',
 'bit',
 'only',
 'a',
 'bit',
 'lol',
 'im',
 'fine',
 'thanks',
 'hows',
 'you']

In [15]:
lemma = WordNetLemmatizer()

In [16]:
stop_words = set(stopwords.words('english'))

In [17]:
len(stop_words)

179

In [18]:
df['tweet_refine'] = df['tweet_tokens'].apply(lambda x: [word for word in x if word not in stop_words])

In [19]:
X = df['tweet_refine']

In [20]:
X

0          [switchfoot, httptwitpiccomyzl, awww, thats, b...
1          [upset, cant, update, facebook, texting, might...
2          [kenichan, dived, many, times, ball, managed, ...
3                    [whole, body, feels, itchy, like, fire]
4            [nationwideclass, behaving, im, mad, cant, see]
                                 ...                        
1599995                  [woke, school, best, feeling, ever]
1599996    [thewdbcom, cool, hear, old, walt, interviews,...
1599997                [ready, mojo, makeover, ask, details]
1599998    [happy, th, birthday, boo, alll, time, tupac, ...
1599999    [happy, charitytuesday, thenspcc, sparkscharit...
Name: tweet_refine, Length: 1600000, dtype: object

In [21]:
df['tweet_refine'] = df['tweet_refine'].apply(lambda x: [lemma.lemmatize(word) for word in x])

In [22]:
stem = PorterStemmer()

In [23]:
df['tweet_refine'] = df['tweet_refine'].apply(lambda x: [stem.stem(word) for word in x])

In [24]:
df['tweet_refine'][5634]

['cyclesoci',
 'comment',
 'neg',
 'stori',
 'httptinyurlcomcgqajm',
 'uk',
 'societi',
 'seemingli',
 'doesnt',
 'want',
 'slow']

In [25]:
y = df['target']

In [26]:
y

0          0
1          0
2          0
3          0
4          0
          ..
1599995    4
1599996    4
1599997    4
1599998    4
1599999    4
Name: target, Length: 1600000, dtype: int64

In [27]:
len(X[2452])

12

In [28]:
tokenizer = Tokenizer(num_words=100000,oov_token="<OOV>")

In [29]:
tokenizer.fit_on_texts(X)

In [30]:
X_tokenized = tokenizer.texts_to_sequences(X)

In [31]:
X_tokenized[2452]

[720, 1, 1439, 211, 3240, 6819, 26, 639, 1108, 36, 27, 85672]

In [32]:
X[2452]

['ahhh',
 'drafthouse',
 'surprise',
 'world',
 'premiere',
 'screening',
 'new',
 'star',
 'trek',
 'last',
 'night',
 'torchys']

In [33]:
X_padded = pad_sequences(X_tokenized,maxlen=50)

In [34]:
len(X_padded[534])

50

In [35]:
X_padded

array([[    0,     0,     0, ...,  9760,  1737,     4],
       [    0,     0,     0, ...,     9,   170,  1077],
       [    0,     0,     0, ...,   362,     7, 25252],
       ...,
       [    0,     0,     0, ...,  8171,   502,  1893],
       [    0,     0,     0, ..., 13591,     1,     1],
       [    0,     0,     0, ...,     1,     1,     1]])

In [36]:
def build_lstm_model(vocab_size,embedding_dim=100,max_len=50):
    model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_len),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1,activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy',metrics=['accuracy'])
    return model

In [37]:
def train_model(model,X_train, X_val, y_train, y_val,epochs=5, batch_size=64):
    history = model.fit(X_train,y_train,epochs=epochs,batch_size=batch_size,validation_data=(X_val,y_val),verbose=1)
    return history

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_padded,y,test_size=0.2, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.2, random_state = 42)

In [39]:
vocab_size = len(tokenizer.word_index)+1

In [40]:
vocab_size

790527

In [41]:
model = build_lstm_model(vocab_size)



In [None]:
history = train_model(model, X_train, y_train, X_val, y_val)

In [45]:
model.save('sentimental_model.h5')
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)

