In [1]:
#Importing libraries used in these tasks

import pandas
import numpy 
pandas.options.mode.chained_assignment = None
pandas.set_option('display.max_colwidth', None)

from scipy.stats import skew, kurtosis

import matplotlib.pyplot as plot
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# !pip install keras
# !pip install tensorflow
from keras.models import Sequential
from keras.layers import Embedding, LSTM, GRU, SimpleRNN, Bidirectional, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
senti = pandas.read_csv(r'C:\Users\USER\Documents\Data Science Internship\data\urdu-sentiment-corpus-v1.tsv', sep='\t')

#Drop the O class because the task require us to do a binary classification
senti = senti[senti['Class'] != 'O']
senti.reset_index(drop=True, inplace=True)

senti

Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب کوٹ لکھپت والی اتفاق فیکٹری میں نہیں بنتا۔ایٹم بم کہوٹہ کی ایٹمی۔۔۔,P
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن سکتے,N
2,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000 فٹ کی بلندی پر چھلانگ لگا کر عالمی ریکارڈ قائم کرلیا۔ چھلانگ کی۔۔۔",P
3,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P
4,گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے هو تم۔ جیالا هو اور جاهل نه هو یه ممکن نهیں ۔,N
...,...,...
975,اُس آدمی نے اِس سالار کو کافی معقول ٹپ دی ہے ۔,P
976,چچا غالب کی روح سے معذرت کے ساتھہم نے مانا کہ کچھ نہیں ڈیزلمفت ہاتھ آئے تو برا کیا ہے,P
977,واہ جناب واہ! اچھی رہی۔ جناب خود کو فرشتہ سمجوں وہ انسان ہیں,P
978,اسلام آباد :پی اے ٹی کا دھرنا ختم، صفائی کے کام کا آغاز ، روزنامہ اُردو پوائنٹ,P


In [3]:
#Checking for null or missing values

print("Missing values: \n")
print(senti.isnull().sum())

if senti.isnull().sum().sum() > 0:
    for inst in senti['Class'].isnull():
        if inst == 1:
            senti['Class'].fillna(senti['Class'].mode()[0], inplace=True)
            
print("\nMissing values: \n")
print(senti.isnull().sum())

Missing values: 

Tweet    0
Class    1
dtype: int64

Missing values: 

Tweet    0
Class    0
dtype: int64


In [4]:
tweet = senti['Tweet']
category = senti['Class']

trainData, testData, trainTarget, testTarget = train_test_split(tweet, category, test_size=0.25, random_state=15, stratify=senti["Class"])

#Creating tokens of Urdu sentences and their total count in the dataset
tokens = Tokenizer()
tokens.fit_on_texts(trainData)

token = pandas.DataFrame(tokens.word_counts.items(), columns=['word', 'count'])

token.head(8)

Unnamed: 0,word,count
0,ﮐﻤﯿﺸﻦ,1
1,ﻧﮯﻧﻨﺪﯼ,1
2,ﭘﻮﺭﺑﺠﻠﯽ,1
3,ﻣﻨﺼﻮﺑﮯﻣﯿﮟ,1
4,ﺗﺎﺧﯿﺮﮐﺎﺫﻣﮯﺩﺍﺭﻭﺯﺍﺭﺕ,1
5,ﻗﺎﻧﻮﻥ,1
6,ﮐﻮﻗﺮﺍﺭ,1
7,ﺩﯾﺎ،,1


In [5]:
#Replacing each sentence token with their respective total count in the dataset
trainSentences = tokens.texts_to_sequences(trainData)
testSentences = tokens.texts_to_sequences(testData)

print("Sentences before padding:\n", trainSentences[0:1])

#Add padding [0] to match each of the sentence lengths
maxLength = 0
for sentence in trainSentences:
    length = len(sentence)
    if length > maxLength:
        maxLength = length

trainSentencesPadded = pad_sequences(trainSentences, maxlen=maxLength)
testSentencesPadded = pad_sequences(testSentences, maxlen=maxLength)

print("Sentences after padding:\n", trainSentencesPadded[0:1])

Sentences before padding:
 [[1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447, 1448, 1449, 790, 1450, 137, 1451, 1452, 1453, 183, 1454, 1455, 790, 1456, 791, 1457]]
Sentences after padding:
 [[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
  1440 1441 1442 1443 1444 1445 1446 1447 1448 1449  790 1450  137 1451
  1452 1453  183 1454 1455  790 1456  791 1457]]


In [6]:
# Mapping the categories of sentiment to numerical format
trainCategory = trainTarget.map({'P': 1, 'N': 0})
testCategory = testTarget.map({'P': 1, 'N': 0})

# 2 * 2 = 4 parameters
parameters = [[2, 0.3], [2, 0.7], [3, 0.3], [3, 0.7]]

results = pandas.DataFrame(columns = ['Model', 'Layers', 'Dropout Rate', 'Accuracy', 'Precision', 'Recall', 'F1'])

In [7]:
def evaluate_model(model, sequences, labels):
    predictions = model.predict(sequences)
    predicted_classes = numpy.round(predictions)
    accuracy = accuracy_score(labels, predicted_classes)
    precision = precision_score(labels, predicted_classes, zero_division=1)
    recall = recall_score(labels, predicted_classes)
    f1 = f1_score(labels, predicted_classes)
    return accuracy, precision, recall, f1

In [12]:
for parameter in parameters:
    #RNN Model        
    model = Sequential()
    model.add(Embedding(input_dim = len(tokens.word_index)+1, output_dim = 100, input_length = maxLength))
    for _ in range(parameter[0]):
        model.add(SimpleRNN(units = 64, dropout = parameter[1], return_sequences=True))
    model.add(SimpleRNN(units = 64, dropout = parameter[1]))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(trainSentencesPadded, trainCategory, validation_data = (testSentencesPadded, testCategory), epochs = 10, batch_size = 20, verbose = 0)

    accuracy, precision, recall, f1 = evaluate_model(model, testSentencesPadded, testCategory)
    score = ['RNN', parameter[0], parameter[1], accuracy, precision, recall, f1]
    
    temp = pandas.DataFrame([score], columns = ['Model', 'Layers', 'Dropout Rate', 'Accuracy', 'Precision', 'Recall', 'F1'])
    results = pandas.concat([results, temp])
    
    #GRU Model        
    model = Sequential()
    model.add(Embedding(input_dim = len(tokens.word_index)+1, output_dim = 100, input_length = maxLength))
    for _ in range(parameter[0]):
        model.add(GRU(units = 64, dropout = parameter[1], return_sequences=True))
    model.add(GRU(units = 64, dropout = parameter[1]))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(trainSentencesPadded, trainCategory, validation_data = (testSentencesPadded, testCategory), epochs = 15, batch_size = 20, verbose = 0)

    accuracy, precision, recall, f1 = evaluate_model(model, testSentencesPadded, testCategory)
    score = ['GRU', parameter[0], parameter[1], accuracy, precision, recall, f1]
    
    temp = pandas.DataFrame([score], columns = ['Model', 'Layers', 'Dropout Rate', 'Accuracy', 'Precision', 'Recall', 'F1'])
    results = pandas.concat([results, temp])
    
    #LSTM Model        
    model = Sequential()
    model.add(Embedding(input_dim = len(tokens.word_index)+1, output_dim = 100, input_length = maxLength))
    for _ in range(parameter[0]):
        model.add(LSTM(units = 64, dropout = parameter[1], return_sequences=True))
    model.add(LSTM(units = 64, dropout = parameter[1]))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(trainSentencesPadded, trainCategory, validation_data = (testSentencesPadded, testCategory), epochs = 10, batch_size = 20, verbose = 0)

    accuracy, precision, recall, f1 = evaluate_model(model, testSentencesPadded, testCategory)
    score = ['LSTM', parameter[0], parameter[1], accuracy, precision, recall, f1]
    
    temp = pandas.DataFrame([score], columns = ['Model', 'Layers', 'Dropout Rate', 'Accuracy', 'Precision', 'Recall', 'F1'])
    results = pandas.concat([results, temp])

    #BiLSTM Model        
    model = Sequential()
    model.add(Embedding(input_dim = len(tokens.word_index)+1, output_dim = 100, input_length = maxLength))
    for _ in range(parameter[0]):
        model.add(Bidirectional(LSTM(units = 64, dropout = parameter[1], return_sequences=True)))
    model.add(Bidirectional(LSTM(units = 64, dropout = parameter[1])))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(trainSentencesPadded, trainCategory, validation_data = (testSentencesPadded, testCategory), epochs = 10, batch_size = 20, verbose = 0)

    accuracy, precision, recall, f1 = evaluate_model(model, testSentencesPadded, testCategory)
    score = ['BiLSTM', parameter[0], parameter[1], accuracy, precision, recall, f1]
    
    temp = pandas.DataFrame([score], columns = ['Model', 'Layers', 'Dropout Rate', 'Accuracy', 'Precision', 'Recall', 'F1'])
    results = pandas.concat([results, temp])
    
results.reset_index(drop=True, inplace=True)
results



Unnamed: 0,Model,Layers,Dropout Rate,Accuracy,Precision,Recall,F1
0,BiLSTM,2,0.7,0.587755,0.581197,0.566667,0.57384
1,BiLSTM,2,0.7,0.612245,0.612613,0.566667,0.588745
2,RNN,2,0.3,0.514286,0.503759,0.558333,0.529644
3,GRU,2,0.3,0.632653,0.644231,0.558333,0.598214
4,LSTM,2,0.3,0.616327,0.606557,0.616667,0.61157
5,BiLSTM,2,0.3,0.583673,0.568182,0.625,0.595238
6,RNN,2,0.7,0.489796,0.489796,1.0,0.657534
7,GRU,2,0.7,0.608163,0.589552,0.658333,0.622047
8,LSTM,2,0.7,0.62449,0.611111,0.641667,0.626016
9,BiLSTM,2,0.7,0.62449,0.588608,0.775,0.669065


In [14]:
# As we can see BiLSTM Model at 3 layers and 0.3 dropout rate performs the best so this will be used for future predictions
parameter = [3, 0.3]

model = Sequential()
model.add(Embedding(input_dim = len(tokens.word_index)+1, output_dim = 100, input_length = maxLength))
for _ in range(parameter[0]):
    model.add(Bidirectional(LSTM(units = 64, dropout = parameter[1], return_sequences=True)))
model.add(Bidirectional(LSTM(units = 64, dropout = parameter[1])))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(trainSentencesPadded, trainCategory, validation_data = (testSentencesPadded, testCategory), epochs = 10, batch_size = 20, verbose = 0)

accuracy, precision, recall, f1 = evaluate_model(model, testSentencesPadded, testCategory)
score = ['BiLSTM', parameter[0], parameter[1], accuracy, precision, recall, f1]

temp = pandas.DataFrame([score], columns = ['Model', 'Layers', 'Dropout Rate', 'Accuracy', 'Precision', 'Recall', 'F1'])   
temp.reset_index(drop=True, inplace=True)

temp



Unnamed: 0,Model,Layers,Dropout Rate,Accuracy,Precision,Recall,F1
0,BiLSTM,3,0.3,0.632653,0.605634,0.716667,0.656489


In [18]:
# Saving our model results for future use
modelData = {
    'tokenizer': tokens,
    'maxLength': maxLength,
    'model': model
}

import pickle
with open('urduSentiModel', 'wb') as file:
    pickle.dump(modelData, file)