In [76]:
# Implement following sequence based deep learning models for the same task of sentiment analysis.
# Perform binary text classification: RNN, GRU, LSTM , BiLSTM

# You can implement these models in Keras or Pytorch.
#Split the data into train and test set. Use 75% for training and 25% for testing.
# For each of these models, try following hyper parameters and report the best results with parameter values.

# Number of layers = 2 or 3.
# Dropout rate, 0.3 or 0.7
# So you will have 2 *2 = 4 different sets of parameters.

# Calculate accuracy, Precision, Recall and F-score for all classifiers and report the results in table.
# Also report parameter values which were used to get the results.

import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, GRU, LSTM, Bidirectional
from keras.layers import Flatten, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing import sequence

In [78]:
#Loading data
df = pd.read_csv('/content/drive/MyDrive/Data Science Internship/week_2/Day_2/urdu-sentiment-corpus-v1.tsv', sep='\t', encoding='utf-8')

In [41]:
df.head()

Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
2,ٹویٹر کا خیال کیسے آیا ؟,O
3,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
4,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P


In [83]:
df.describe()

Unnamed: 0,Tweet,Class
count,1000,999
unique,999,3
top,اللہ جانے وے ماہی تیرا پیار کی اے دل دی اوداسی...,N
freq,2,499


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Tweet   1000 non-null   object
 1   Class   999 non-null    object
dtypes: object(2)
memory usage: 15.8+ KB


In [79]:
#Converting to list the values in file:
Tweets = df['Tweet'].tolist()
Classes  = df['Class'].tolist()

numFreqwords = 5000      # Number of most frequent words to consider
maxSeqwords  =  500      # Maximum sequence length

tokenizer = Tokenizer(num_words=numFreqwords)
tokenizer.fit_on_texts(Tweets)

sequences = tokenizer.texts_to_sequences(Tweets)

x = pad_sequences(sequences, maxlen=maxSeqwords)
y = np.array(Classes)

np.random.seed(42)

indices = np.random.permutation(len(x))
x = x[indices]
y = y[indices]

In [80]:
#Changing the class labels in file to binary form:
#0 for N and 1 for P
y[y=='N']=0
y[y=='P']=1
y[y=='O']=0
y[y=='nan']=0

#Now Converting to int type
y= y.astype(int)

trainSize = int(0.75 * len(x)) #75% data for training

#Saving the correct portion of data for training and testing
x_train, x_test = x[:trainSize], x[trainSize:]
y_train, y_test = y[:trainSize], y[trainSize:]

# Different hyper parameters as mentioned in question
diffLayers = [2, 3]
dropoutRates = [0.3, 0.7]
modelType = ['RNN', 'GRU', 'LSTM', 'BiLSTM']

In [81]:
def createModel(numOFlayers, dropoutRate, mod):

    model = Sequential()

    # Adding Embedding layer
    model.add(Embedding(5000, 32, input_length=maxSeqwords))

    # Dealing with Recurrent layers
    for i in range(numOFlayers):
        if mod == 'RNN':      # Recurrent Neural Network
            model.add(SimpleRNN(32, return_sequences=True))
        elif mod == 'GRU':    # Gated Recurrent Unit
            model.add(GRU(32, return_sequences=True))
        elif mod == 'LSTM':   # Long short-term memory
            model.add(LSTM(32, return_sequences=True))
        elif mod == 'BiLSTM': # Bidirectional Long Short-Term Memory
            model.add(Bidirectional(LSTM(32, return_sequences=True)))

        model.add(Dropout(dropoutRate))

    model.add(Flatten())                        #Adding flatten layer
    model.add(Dense(1, activation='sigmoid'))   #Adding output layer

    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model


results = [] #This will save all the results of each iteration

In [82]:
#Running different combinations of parameters:

for layers in diffLayers:
    for rate in dropoutRates:
        for models in modelType:

            # Creating and compiling the model by calling the function above called createModel
            model = createModel(layers, rate, models)

            # Training the model
            #Keeping the epochs set to 3
            model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

            # Making predictions:
            y_pred_probs = model.predict(x_test)
            y_pred = np.round(y_pred_probs).flatten().astype(int)

            # Calculating evaluation metrics according to ones instructed in question:
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred)
            recall = recall_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            # Saving the results by appending dictionary type results for each iteration
            results.append({
                      'Layers': layers,
                      'Dropout Rate': rate,
                      'Model Type': models,
                      'Accuracy': accuracy,
                      'Precision': precision,
                      'Recall': recall,
                      'F1-score': f1
                           })



# Printing the results in table form as instructed in question
print("\nResults after testing different parameters are as follows:")
print("Layers\tDropout Rate\tModel Type\tAccuracy\tPrecision\tRecall\t\tF1-score")
for result in results:
    print(f"{result['Layers']}\t{result['Dropout Rate']}\t\t{result['Model Type']}\t\t{result['Accuracy']:.4f}\t\t{result['Precision']:.4f}\t\t{result['Recall']:.4f}\t\t{result['F1-score']:.4f}")


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3

Results after testing different parameters are as follows:
Layers	Dropout Rate	Model Type	Accuracy	Precision	Recall		F1-score
2	0.3		RNN		0.5760		0.5000		0.0189		0.0364
2	0.3		GRU		0.4240		0.4240		1.0000		0.5955
2	0.3		LSTM		0.5800		1.0000		0.0094		0.0187
2	0.3		BiLSTM		0.5720		0.4912		0.2642		0.3436
2	0.7		RNN		0.4320		0.4237		0.9434		0.5848
2	0.7		GRU		0.5440		0.4355		0.2547		0.3214
2	0.7		LSTM		0.5760		0.0000		0.0000		0.0000
2	0.7		BiLSTM		0.4240		0.4240		1.0000		0.5955
3	0.3		RNN		0.5760		0.5000		0.0094		0.0185
3	0.3		GRU		0.4240		0.4240		1.0000		0.5955
3	0.3		LSTM		0.4240		0.4240		1.0000		0.5955
3	0.3		BiLSTM		0.5800		1.0000		0.0094		0.0187
3	0.7		RNN		0.5760		0.5000		0.0189		0.0364
3	0.7		GRU		0.5760		0.0000		0.0000		0.0000
3	0.7		LSTM		0.4240		0.4240		1.0000		0.5955
3	0.7		BiLSTM		0.5760		0.0000		0.0000		0.0000


  _warn_prf(average, modifier, msg_start, len(result))
