In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Reading the provided data

In [2]:
# Load the raw tweets; the columns used later in this notebook are
# `clean_text` and `category`.
data = pd.read_csv(r'/kaggle/input/twitter-sentiment-dataset/Twitter_Data.csv')

# Drop every row that has a missing value. Rebinding (instead of
# `inplace=True`) keeps this cell safe to re-run.
data = data.dropna()

# `data_df` is a second name for the same cleaned frame; the LSTM section
# reads from it. To iterate faster on a subset, slice both names here,
# e.g. `data_df = data = data[:20000]`.
data_df = data

# Preview a few random rows. This must be the last expression of the cell,
# otherwise the notebook does not render it.
data.sample(5)

# Let's take a deep dive into the data

In [3]:
 # Character count of the longest tweet in the `clean_text` column.
data["clean_text"].str.len().max()

In [4]:
# Bar chart of the number of tweets per value of `category`.
data["category"].value_counts().plot.bar()

Observation: The classes are not perfectly balanced, but the imbalance is not severe.

## What are the most frequent words?

In [5]:
from nltk.probability import FreqDist

# Glue all tweets into one long string (cast to str defensively), then count
# whitespace-separated tokens and keep the ten most common ones.
entire_text = " ".join(str(tweet) for tweet in data["clean_text"].values.tolist())
fdist = FreqDist(entire_text.split())
top_ten = fdist.most_common(10)

In [6]:
# Display the ten most common tokens computed by `fdist.most_common(10)`.
top_ten

Observations and comments:
* The most frequent word is "Modi"; the others are ordinary stopwords.
* We will have to remove the stopwords.
* We can train the model with or without the keyword "Modi".

# Data Cleaning

In [7]:
# Text cleaning
import nltk
from bs4 import BeautifulSoup

from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer 
import re
from tqdm import tqdm

from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def text_cleaning(text):
    """Clean one tweet and return it as a space-separated string of tokens.

    Steps, in order: lower-case, strip markup with BeautifulSoup, remove
    http(s) URLs and @mentions, replace non-word characters and underscores
    with spaces, split into word tokens, drop English stop words, and
    lemmatize what is left with WordNet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces.
    """
    # change the text into lower case.(Note: in case of social media text, it is good to leave them as it is!)
    text = text.lower()
    # removing xml/html tags from tweets
    text = BeautifulSoup(text, 'lxml').get_text()
    # removing URLs
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)
    # removing words with "@"
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    # replacing special characters and underscores with a space
    text = re.sub(r"\W+|_", ' ', text)
    # splitting the text into word tokens
    tokens = word_tokenize(text)

    # The stop-word set is fetched once per call from a cached helper; the
    # previous version rebuilt it from the corpus for every single token.
    stop_words = _english_stopwords()

    # lemmatize the remaining tokens using WordNet
    lm = WordNetLemmatizer()
    words = [lm.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(words)

In [8]:
# data.clean_text = data['clean_text'].apply(text_cleaning)

# Data Transformation

We cannot feed raw text directly to an ML/DL model, so we need techniques that convert the text into a numeric form that models can understand. Basic ones include BOW and TF-IDF; more advanced ones include word2vec, GloVe and fastText.

#### TF-IDF is quite simple to understand: it gives importance to
1. Words that are rare across the corpus
2. Words that are common within a document

#### word2vec is another well-known method, in which the word vectors are learned by training a neural network.
There are two types of word2vec, based on the core training idea:
1. CBOW: given the context words, predict the focus word
2. Skip-gram: given the focus word, predict the context words

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 25% of the tweets for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data["clean_text"],
    data["category"],
    test_size=0.25,
    random_state=42,
)
print(X_train.shape, X_test.shape)

# Unigram TF-IDF with no document-frequency pruning. The vocabulary is learned
# from the training split only and then re-used to encode the test split.
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_df=1.0, min_df=1)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [12]:
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn.metrics import accuracy_score

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in
# 1.2. Keep using it where it exists; otherwise define a thin shim with the
# same call shape so the cells below keep working unchanged.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of a fitted `estimator` on (`X`, `y_true`).

        Forwards to `ConfusionMatrixDisplay.from_estimator` and returns the
        resulting display object.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Baseline: logistic regression on the TF-IDF features.
model = LogisticRegression(solver='liblinear')
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))

plot_confusion_matrix(model, X_test_tfidf, y_test)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Second baseline: random forest on the same TF-IDF features.
# The forest is randomised (bootstrap samples, feature sub-sampling), so it is
# seeded with the same value as the train/test split; without a seed the
# printed accuracy changes from run to run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred))
plot_confusion_matrix(model, X_test_tfidf, y_test)

# Deep Learning Model: LSTM


LSTM is a deep learning model and a successor to the plain RNN, which suffers from vanishing and exploding gradients and therefore does not work well on long sequences of text.

In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout

### Transforming the data for LSTM model

In [15]:
# Cap on the vocabulary: the tokenizer only emits indices for the most
# frequent words below this number.
max_features = 20000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data_df['clean_text'].values)

# Turn every tweet into a list of word indices, then pad (at the end) or
# truncate each list to exactly 300 entries.
X = pad_sequences(
    tokenizer.texts_to_sequences(data_df['clean_text'].values),
    maxlen=300,
    padding='post',
)

# One-hot encode the target: one column per distinct `category` value.
Y = pd.get_dummies(data_df['category']).values

# Number of distinct words seen by the tokenizer, plus one.
vocab_size = 1 + len(tokenizer.word_index)

### Splitting the entire dataset into train and test sets in a 75:25 ratio.

In [16]:
from sklearn.model_selection import train_test_split

# 75/25 split of the padded sequences and their one-hot targets, seeded for
# repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

Defining our model

In [17]:
embid_dim = 300   # length of the vector learned for each word
lstm_out = 128    # units in each direction of the bidirectional LSTM

# Embedding -> bidirectional LSTM -> two dense layers (with dropout after the
# first) -> 3-way softmax output.
model = keras.Sequential([
    Embedding(max_features, embid_dim, input_length=X.shape[1]),
    Bidirectional(LSTM(lstm_out, dropout=0.2)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax'),     # softmax for final layer
])
model.summary()

In [18]:
batch_size = 128

# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs; the held-out test split is passed as validation data,
# so its loss and accuracy are reported after every epoch.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=5,
    batch_size=batch_size,
    verbose=1,
)

# Prediction with test data

In [19]:
# Per-class probabilities for every test tweet, then the index of the most
# probable class in each row.
y_prob = model.predict(X_test)
y_classes = np.argmax(y_prob, axis=-1)

In [20]:
from sklearn.metrics import confusion_matrix

# Output position 0 / 1 / 2 stands for sentiment label -1 / 0 / 1 (the same
# mapping the earlier element-by-element loop applied).
# NOTE(review): this assumes the one-hot columns built by
# pd.get_dummies(data_df['category']) are ordered -1, 0, 1 - confirm.
label_lookup = np.array([-1, 0, 1])

# Always decode from `y_prob` rather than rewriting `y_classes` in place:
# an in-place remap is not idempotent, so re-running the cell would shuffle
# labels that had already been converted.
y_classes = label_lookup[y_prob.argmax(axis=-1)]

# Decode the ground truth from `Y_test`, the targets produced by the same
# split as `X_test`, so the score does not rely on a separate, earlier
# train/test split happening to select the same rows.
y_true_lstm = label_lookup[Y_test.argmax(axis=-1)]

print(accuracy_score(y_true_lstm, y_classes))

In [21]:
# Confusion matrix of the LSTM predictions, drawn as a heatmap with the
# integer count written in every cell.
sns.heatmap(
    data=confusion_matrix(y_test, y_classes),
    fmt='d',
    annot=True,
)