Importing libraries and using the Kaggle API to access the datasets

In [None]:
# Install the Kaggle CLI with the %pip magic so it targets the running kernel's environment.
%pip install -q kaggle



In [None]:
# -p keeps this cell re-runnable: no error when the directory already exists.
!mkdir -p ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
# Explicit shell escapes (no reliance on IPython automagic); restrict the
# credential file to the current user so the API token is not world-readable.
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download into /content explicitly, since the following cells read from that path.
!kaggle datasets download -d samanazhar/multilingual-sentiment-analysis -p /content

Downloading multilingual-sentiment-analysis.zip to /content
 99% 1.80G/1.81G [00:20<00:00, 131MB/s]
100% 1.81G/1.81G [00:20<00:00, 96.1MB/s]


In [None]:
# -o overwrites without prompting (a prompt would stall a re-run of this cell);
# -d pins the extraction directory to the one the CSVs are later read from.
!unzip -o /content/multilingual-sentiment-analysis.zip -d /content

Archive:  /content/multilingual-sentiment-analysis.zip
  inflating: PMLN_predicted_tweets.csv  
  inflating: PPP_predicted_tweets.csv  
  inflating: PTI_predicted_tweets.csv  
  inflating: Scraped_Tweets/PMLN_Complete_Dataset.csv  
  inflating: Scraped_Tweets/PPP_Complete_Dataset.csv  
  inflating: Scraped_Tweets/PTI_Complete_Dataset.csv  


In [None]:
from tensorflow.keras.callbacks import EarlyStopping
import datasets
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow
import transformers
import tensorflow as tf
from tensorflow.keras import layers
import re

In [None]:
# Load the three per-party prediction files (PMLN, PPP, PTI) in one pass.
ds, ds1, ds2 = map(
    pd.read_csv,
    (
        '/content/PMLN_predicted_tweets.csv',
        '/content/PPP_predicted_tweets.csv',
        '/content/PTI_predicted_tweets.csv',
    ),
)

In [None]:
# Keep only the rows whose `language` column equals 'in'.
ds = ds.loc[ds['language'].eq('in')]
ds1 = ds1.loc[ds1['language'].eq('in')]
ds2 = ds2.loc[ds2['language'].eq('in')]

Preprocessing Data

In [None]:
# Drop rows with missing values, then drop rows whose tweet is a float rather
# than text. Each frame is cleaned from itself: the previous version assigned
# the cleaned `ds` and `ds2` to `ds1`, which discarded the PPP data and
# duplicated the PTI data in the combined corpus.
ds = ds.dropna()
ds1 = ds1.dropna()
ds2 = ds2.dropna()

ds = ds[~ds['preprocessed_tweet'].apply(lambda x: isinstance(x, float))]
ds1 = ds1[~ds1['preprocessed_tweet'].apply(lambda x: isinstance(x, float))]
ds2 = ds2[~ds2['preprocessed_tweet'].apply(lambda x: isinstance(x, float))]

# Renumber rows 0..n-1 after filtering (new frames instead of inplace mutation).
ds = ds.reset_index(drop=True)
ds1 = ds1.reset_index(drop=True)
ds2 = ds2.reset_index(drop=True)

In [None]:
# Stack the tweet text and sentiment columns of all three frames, then turn
# them into two parallel tuples via a list of (text, label) pairs.
frames = (ds, ds1, ds2)
X = pd.concat([frame['preprocessed_tweet'] for frame in frames], ignore_index=True)
y = pd.concat([frame['sentiment'] for frame in frames], ignore_index=True)
dataset = [pair for pair in zip(X, y)]
X, y = zip(*dataset)

In [None]:
# Hold out 10% of the data for testing, then split 20% of the remainder off
# as the validation set. Seeds are fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=10,
    test_size=0.1,
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    shuffle=True,
    random_state=21,
    test_size=0.2,
)

In [None]:
def pre_process(ds):
    """Split each text into a list of ASCII-letter-only words.

    Every character outside ``a-z``/``A-Z`` is treated as a separator, so
    digits, punctuation and non-ASCII characters are removed and words are
    split around them. Case is preserved.

    Args:
        ds: Iterable of texts (list, tuple, Series, ...).

    Returns:
        A list with exactly one token list per input item, in input order.
        Items that are not ``str`` produce an empty token list instead of
        being skipped, so the result stays aligned with a parallel label
        sequence.
    """
    corpus = []
    # Iterate directly rather than via ds[i], so containers without
    # positional integer indexing are handled as well.
    for text in ds:
        if isinstance(text, str):
            # After the substitution only letters and spaces remain, so a
            # plain whitespace split yields the non-empty words.
            corpus.append(re.sub('[^a-zA-Z]', ' ', text).split())
        else:
            corpus.append([])
    return corpus


In [None]:
# Turn every split into lists of word tokens.
# NOTE: run this cell once per split; the x_* names are overwritten below.
x_train = pre_process(x_train)
x_val = pre_process(x_val)
x_test = pre_process(x_test)

# Fit the vocabulary on the preprocessed *training* tokens only. Fitting on
# the raw `X` used the test/validation text as well and tokenised it
# differently from the pre_process output that is encoded below.
# Words unseen during fitting map to the OOV index instead of being dropped.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(x_train)

# Pad/truncate to the longest training tweet measured in tokens
# (len() of a raw string in `X` counts characters, not words).
max_len = max(len(tokens) for tokens in x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_val = tokenizer.texts_to_sequences(x_val)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=max_len, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post', truncating='post')
x_val = pad_sequences(x_val, maxlen=max_len, padding='post', truncating='post')

In [None]:
# Number of embedding rows needed: one per known word, plus index 0, which
# the Keras Tokenizer never assigns to a word (it is used for padding here).
vocab_size = len(tokenizer.word_index) + 1

RNN with Bi-LSTM Model

In [None]:
def create_sentiment_analysis_model(vocab_size, embedding_dim, maxlen):
    """Build the (uncompiled) bidirectional-LSTM sentiment classifier.

    Architecture: embedding -> dropout -> Bi-LSTM(12 units per direction,
    final state only) -> dropout -> Dense(64, relu) -> dropout ->
    Dense(1, sigmoid). All dropout rates are 0.5.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        maxlen: Length of the input sequences.

    Returns:
        A ``tf.keras.Sequential`` model emitting one sigmoid value per input.
    """
    stack = [
        layers.Embedding(vocab_size, embedding_dim, input_length=maxlen),
        layers.Dropout(0.5),
        layers.Bidirectional(layers.LSTM(12, return_sequences=False)),
        layers.Dropout(0.5),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid'),
    ]
    return tf.keras.Sequential(stack)


# Size the embedding table from the fitted tokenizer instead of a hard-coded
# count, so it cannot go stale when the data or preprocessing changes.
# +1 because word indices start at 1 and index 0 is used for padding.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 12
maxlen = max_len

model = create_sentiment_analysis_model(vocab_size, embedding_dim, maxlen)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 279, 12)           1050648   
                                                                 
 dropout_3 (Dropout)         (None, 279, 12)           0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 24)                2400      
 onal)                                                           
                                                                 
 dropout_4 (Dropout)         (None, 24)                0         
                                                                 
 dense_2 (Dense)             (None, 64)                1600      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                      

In [None]:
# Stop when validation accuracy has not improved for 3 epochs. With
# restore_best_weights the model is rolled back to its best epoch when early
# stopping triggers, instead of keeping the weights of the last epoch run.
es_callback = EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    restore_best_weights=True,
)

# Keep the History object so the per-epoch metrics can be inspected later.
history = model.fit(
    x_train,
    tf.constant(y_train),
    epochs=20,
    batch_size=64,
    validation_data=(x_val, tf.constant(y_val)),
    callbacks=[es_callback],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.src.callbacks.History at 0x7945089da800>

In [None]:
# Report [loss, accuracy] on the held-out test split.
model.evaluate(x=x_test, y=tf.constant(y_test))



[0.14652438461780548, 0.9587349891662598]

In [None]:
# Push one example through the same pipeline as the training data:
# word split -> integer encoding -> post-padding to max_len -> model.
threshold = 0.5
sentence = pad_sequences(
    tokenizer.texts_to_sequences(pre_process(["ghatiya insaan ho tum!!"])),
    maxlen=max_len,
    padding='post',
    truncating='post',
)
output = model.predict(sentence)
# Binarise the sigmoid score at the threshold.
if output > threshold:
    prediction = 1
else:
    prediction = 0
print("Sentiment for this tweet is:", prediction)

Sentiment for this tweet is: 1


In [None]:
# Push one example through the same pipeline as the training data:
# word split -> integer encoding -> post-padding to max_len -> model.
threshold = 0.5
sentence = pad_sequences(
    tokenizer.texts_to_sequences(pre_process(["i will vote for you!"])),
    maxlen=max_len,
    padding='post',
    truncating='post',
)
output = model.predict(sentence)
# Binarise the sigmoid score at the threshold.
if output > threshold:
    prediction = 1
else:
    prediction = 0
print("Sentiment for this tweet is:", prediction)

Sentiment for this tweet is: 0
