# Sentiment analysis
1. Task type: NLP
2. Dataset: Tweets (Text type)
3. Usecases: Social Media management, Review Systems, News Analysis for Stock Markets

In [1]:
import pandas as pd
import numpy as np

In [8]:
# df = pd.read_csv('./data/train.csv', encoding='latin-1', names=['Target', 'TweetID', 'Date', 'No_Query', 'UserName', 'Data'])
# df.head()

Observation: 
1. Data is not utf-8 encoded that is why required to set correct encoding to read csv file.
2. Data is not having column names that is why provided it with column name.

In [9]:
# df.info()

As it is visible to me that Target and Data are only columns useful for me to train model for sentiment detection, I can drop other columns.

In [10]:
# df = df[['Target', 'Data']]
# df.head()

Now, I will try to remove words which usually does not contribute to sentiments like tags (@username in data in tweet) and urls. I will keep hashtags as of now to check if they make any effect on data or not. 

In [11]:
# df['Data'] = df['Data'].replace(r'http\S+', '', regex=True).replace(r'@\S+', '', regex=True)

In [12]:
# df.head(20)

Data info showing data is not having null values and datatypes are int64 or objects. Now I need to determine language of text for each statement as I want my model to get trained for english only. 

In [13]:
# from langdetect import detect

In [14]:
# from numpy import NaN


# for i in range(len(df)):
#     if df['Data'][i].isspace() == True:
#         df['Data'][i] = NaN

In [15]:
# df = df[df['Data'].noatna()]

In [16]:
# df = df.reset_index(drop=True)

In [17]:
# for i in range(len(df)):
#     try:
#         detect(df['Data'][i])
#     except:
#         print(i)

In [18]:
# import string
# for char in string.punctuation:
#     df['Data'] = df['Data'].replace(char, NaN, regex=False)

In [19]:
# df.head()

In [20]:
# df['ln']=[0]*len(df)
# print(df.head())
# for i in range(len(df)):
#     try:
#         x = detect(df['Data'][i])
#         df['ln'][i] = x
#     except:
#         df['ln'][i]=NaN

# for i in range(len(df)):
#     if df['ln'][i]=='en':
#         df['ln'][i]='en'
#     else:
#         df['ln'][i]=NaN

In [21]:
# df.head(20)

In [22]:
# df.to_csv('./data/trainModified.csv')

In [2]:
df = pd.read_csv('./data/trainModified.csv')
df.head(10)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Target,Data,ln
0,0,0,0,"- Awww, that's a bummer. You shoulda got Da...",en
1,1,1,0,is upset that he can't update his Facebook by ...,en
2,2,2,0,I dived many times for the ball. Managed to s...,en
3,3,3,0,my whole body feels itchy and like its on fire,en
4,4,4,0,"no, it's not behaving at all. i'm mad. why am...",en
5,5,5,0,not the whole crew,en
6,6,6,0,Need a hug,en
7,7,7,0,"hey long time no see! Yes.. Rains a bit ,onl...",en
8,8,8,0,nope they didn't have it,en
9,9,9,0,que me muera ?,


In [3]:
df = df[df.ln == 'en']
# df = df.drop(df.iloc[:, 0:1], axis=1)
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
df = df.reset_index()
df.head(10)

Unnamed: 0,index,Target,Data,ln
0,0,0,"- Awww, that's a bummer. You shoulda got Da...",en
1,1,0,is upset that he can't update his Facebook by ...,en
2,2,0,I dived many times for the ball. Managed to s...,en
3,3,0,my whole body feels itchy and like its on fire,en
4,4,0,"no, it's not behaving at all. i'm mad. why am...",en
5,5,0,not the whole crew,en
6,6,0,Need a hug,en
7,7,0,"hey long time no see! Yes.. Rains a bit ,onl...",en
8,8,0,nope they didn't have it,en
9,10,0,spring break in plain city... it's snowing,en


In [4]:
len(df[df.Target==0])/len(df[df.Target==4])

1.0135628243231949

It is now clearly visible that data is pretty cleaned and sample is almost of similar length so we can train model without thinking much about bias due to unbalanced data. Data length ratio of 1.01 suggests almost same length of both targets.

Reviews suggests that transformers are doing exceptional in language related tasks so I tried to find out how transformers can be applied for sentiment analysis. BERT is the model I was looking for. 
Implementing BERT with transformers and tensorflow.

Tokenizer prepares text inputs for model. We are loading pretrained weights for BertTokenizer from pretrained model named "bert-base-cased".
Major usecase: question - answers, token classification etc.

In [26]:
# import os

# counterPos = 1
# counterNeg = 2
# main_dir = './data'
# train = os.path.join(main_dir, 'train')
# pos = os.path.join(train, 'pos')
# neg = os.path.join(train, 'neg')
# file_path = os.path.join(pos, f"{counterPos}.txt")

# for i in range(len(df)):
#     try:
#         if df['Target'][i]!=0:
#             file_path = os.path.join(pos, f"{counterPos}.txt")
#             file = open(file_path, 'w')
#             file.write(df['Data'][i])
#             counterPos+=1
#         else:
#             file_path = os.path.join(neg, f"{counterNeg}.txt")
#             file = open(file_path, 'w')
#             file.write(df['Data'][i])
#             counterNeg+=1
#     except:
#         pass

In [5]:
df = df[['Data','Target']]
df.head()

Unnamed: 0,Data,Target
0,"- Awww, that's a bummer. You shoulda got Da...",0
1,is upset that he can't update his Facebook by ...,0
2,I dived many times for the ball. Managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0


In [6]:
df['Target'].value_counts()

0    739687
4    729789
Name: Target, dtype: int64

In [7]:
tweet = df.Data.values


In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)

tokenizer.fit_on_texts(tweet)



In [9]:
encoded_docs = tokenizer.texts_to_sequences(tweet)

In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequence = pad_sequences(encoded_docs, maxlen=500)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding

vocab_size = len(tokenizer.word_index) + 1
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=500))
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           9189888   
                                                                 
 spatial_dropout1d (SpatialD  (None, 500, 32)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 50)                16600     
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 9,206,539
Trainable params: 9,206,539
Non-trainable params: 0
______________________________________________

In [12]:
sentiment_label = df.Target.factorize()

In [13]:
history = model.fit(padded_sequence,sentiment_label[0],validation_split=0.5, epochs=2, batch_size=500)

Epoch 1/2

KeyboardInterrupt: 