In [None]:
#Mounting Google Drive under /content/drive/ (the GloVe vectors are later read from /content/drive/MyDrive/)
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
#Library to perform operations
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#Library to perform train test split
from sklearn.model_selection import train_test_split

#Library to perform PCA
from sklearn.decomposition import PCA
from sklearn import preprocessing 
#Library to calculate accuracy
from sklearn.metrics import accuracy_score

#Library to load data
import re
#Library to perform text preprocessing
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import string

#Library for RNN neural networks
from numpy import array
from numpy import asarray
from numpy import zeros

#Keras Layers
from keras.preprocessing.sequence import pad_sequences
from keras.layers.recurrent import SimpleRNN
from keras.layers import Bidirectional
from keras.layers import Dense
from keras.models import Sequential
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### **A part**

**Downloading and Loading the Dataset**

In [None]:
#Loading the Dataset: each line of the file becomes a list of fields split on tab/newline
data = []
with open('/content/ML_A6_Q2_data.txt') as fh:
    data.extend(re.split('\t|\n', row) for row in fh)
data

FileNotFoundError: ignored

In [None]:
#Building a dataframe from the loaded rows and dropping the third column 'd'
#(presumably the empty field left by splitting on the trailing newline - verify against the data file)
data_2 = pd.DataFrame(data, columns=['Sentence', 'Tag', 'd']).drop(columns=['d'])
print(data_2)

In [None]:
#Printing the (rows, columns) shape of the dataframe
print(data_2.shape)

In [None]:
#Displaying the first rows of the dataframe
data_2.head()

In [None]:
#Countplot of the 0 and 1 labels (one bar per distinct value of the 'Tag' column)
#NOTE: seaborn is already imported as sns in the imports cell; this re-import is redundant but harmless
import seaborn as sns

sns.countplot(x='Tag', data=data_2)

### **B part**

**Preprocess the data**

In [None]:
def text_preprocess(message, stop_words=None):
    """Clean a sentence and return its remaining words as a list.

    The text has every character of ``string.punctuation`` removed, is
    lower-cased, split on whitespace and finally filtered of stop words.

    Parameters
    ----------
    message : str
        Raw sentence to clean.
    stop_words : collection of str, optional
        Words to discard. When omitted, NLTK's English stop-word list is used.

    Returns
    -------
    list of str
        The remaining lower-cased words in their original order. An empty
        list is returned when nothing is left (the previous implementation
        returned None in that case because its ``return`` sat inside a loop).
    """
    #Loading the stop words once per call (as a set) instead of once per word
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
    #Removing the punctuation characters and forming the string again
    punc_remove = ''.join(char for char in message if char not in string.punctuation)
    #Lower-casing, splitting on whitespace and removing the stop words
    return [word for word in punc_remove.lower().split() if word not in stop_words]

In [None]:
#Preprocessing the Data
#The whole column is assigned at once: the chained form data_2['Sentence'][i] = ... may write to a
#temporary copy (SettingWithCopyWarning) and silently does nothing under pandas Copy-on-Write
data_2['Sentence'] = data_2['Sentence'].apply(text_preprocess)
print("Text Preprocessed Data")
print(data_2)

**Splitting into train and test data**

In [None]:
#Separating the labels (last column) from the features (every column before it)
data_y = data_2.iloc[:, -1]
data_x = data_2.iloc[:, :-1]
print(data_x)
print(data_y)

In [None]:
#Dividing the data into train and test with 70:30
#A fixed seed keeps the split (and every result derived from it) identical across Restart & Run All
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.30, random_state=42)

In [None]:
#Renumbering the rows of every split from 0; the shuffled original index is discarded
X_train, y_train, X_test, y_test = (
    part.reset_index(drop=True) for part in (X_train, y_train, X_test, y_test)
)

In [None]:
#Showing the training features followed by the testing features, each under its own heading
print("X_train", X_train, "\nX_test", X_test, sep="\n")

**Getting the training and testing words vocabulary**

In [None]:
#Length of every training sentence and the largest of them
all_word_length = list(map(len, X_train['Sentence']))
max_length = max(all_word_length)
#Flattening the training sentences into one list of words
all_words = [token for sentence in X_train['Sentence'] for token in sentence]
#Sorted list of the distinct training words (the training vocabulary)
train_vocab = sorted(set(all_words))
#Printing the training vocabulary and the maximum sentence length
print(train_vocab)
print(max_length)

In [None]:
#Length of every testing sentence and the largest of them
all_testword_length = list(map(len, X_test['Sentence']))
test_max_length = max(all_testword_length)
#Flattening the testing sentences into one list of words
all_words_test = [token for sentence in X_test['Sentence'] for token in sentence]
#Sorted list of the distinct testing words (the testing vocabulary)
test_vocab = sorted(set(all_words_test))
#Printing the testing vocabulary and the maximum sentence length
print(test_vocab)
print(test_max_length)

**Pre-Trained GloVe**

In [None]:
#Storing the embeddings in a dictionary (word -> array of float32 values)
embeddings_dict = dict()
#Opening the glove file in a with-block so it is closed even if parsing a line fails
with open('/content/drive/MyDrive/glove.6B.50d.txt', encoding="utf8") as glove_file:
    #Iterating over each line
    for line in glove_file:
        #Splitting the line; named 'fields' so the dataset list 'data' loaded earlier is not overwritten
        fields = line.split()
        #Extracting the word from the list made of each line
        word = fields[0]
        #Storing the dimensions of the word in dictionary
        word_dimension = asarray(fields[1:], dtype='float32')
        embeddings_dict[word] = word_dimension

In [None]:
#Printing the embedding vector stored for the word 'very' (prints None if the word is not in the dictionary)
print(embeddings_dict.get('very'))

**Tokenizing the sentences and converting them to their respective sequences**

In [None]:
#Making the Tokens by using the tokenizer class
#Keras only keeps word indices strictly below num_words and indices start at 1, so +1 is required
#to keep the full training vocabulary (with len(train_vocab) the rarest word was silently dropped)
tokenizer = Tokenizer(num_words=len(train_vocab) + 1, lower=True, char_level=False)
#Fit the tokens according to the training data sentences
tokenizer.fit_on_texts(X_train['Sentence'])
#Converting the training words into numeric data sequences
X_train = tokenizer.texts_to_sequences(X_train['Sentence'].tolist())
#Converting the testing words into numeric data sequences (words unseen in training are skipped)
X_test = tokenizer.texts_to_sequences(X_test['Sentence'].tolist())
print("Training Sequences")
print(X_train)
print("Testing Sequences")
print(X_test)

**Padding the train and test data**

In [None]:
#Fixed sequence length used for both splits
max_length = 50

#Vocabulary length is the number of tokenizer words plus 1 for the reserved index
vocabulary_length = 1 + len(tokenizer.word_index)

#Printing the tokenizer word and index dictionary
print(tokenizer.word_index.items())

#Padding the train and test sequences at the end ('post') up to max_length
X_train = pad_sequences(X_train, maxlen=max_length, padding='post')
X_test = pad_sequences(X_test, maxlen=max_length, padding='post')

print("\nTraining Data after Padding")
print(X_train)
print("\nTesting Data after Padding")
print(X_test)

**Making the Embedding matrix**

In [None]:
#Embedding matrix with one 50-dimensional row per vocabulary index, initialised to zeros
embedding_matrix = zeros((vocabulary_length, 50))

#Filling the row of every training word that has a glove vector
for word, index in tokenizer.word_index.items():
    word_dimension = embeddings_dict.get(word)
    #Words missing from the glove data keep their all-zero row
    if word_dimension is None:
        continue
    embedding_matrix[index] = word_dimension

print("The shape of the Embedding Matrix ", embedding_matrix.shape)
print("\nEmbedding Matrix")
print(embedding_matrix)

### **C part**

**Making the network architecture**

In [None]:
#Building the Sequential model from its ordered list of layers
model = Sequential([
    #Embedding layer initialised with the embedding matrix and kept frozen
    Embedding(vocabulary_length, 50, weights=[embedding_matrix], input_length=max_length , trainable=False),
    #Bidirectional Simple RNN layer returning the output of every time step
    Bidirectional(SimpleRNN(50,return_sequences = True)),
    #GlobalMaxPooling1D layer taking the maximum over the time steps
    GlobalMaxPooling1D(),
    #Dense layer with a single sigmoid unit for the binary label
    Dense(1, activation='sigmoid'),
])
#Compiling the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [None]:
print("The Summary of the Model")
#summary() prints the table itself and returns None, so wrapping it in print() added a stray "None" line
model.summary()

In [None]:
#Casting every label to int and storing the training and testing labels as numpy arrays
train_list = list(map(int, y_train))
y_train = np.array(train_list)
test_list = list(map(int, y_test))
y_test = np.array(test_list)

In [None]:
#Fitting the model over the training data for 50 epochs in batches of 128;
#validation_split=0.2 holds out 20% of the training data for the per-epoch validation metrics
trained_model = model.fit(X_train, y_train, batch_size=128, epochs=50, verbose=1,validation_split=0.2)
#Evaluating the fitted model; with metrics=['acc'] each score is [loss, accuracy]
score_train = model.evaluate(X_train, y_train, verbose=1)
score_test = model.evaluate(X_test, y_test, verbose=1)

### **D part**

In [None]:
#score_train / score_test hold [loss, accuracy] from model.evaluate
print("Training Loss:", score_train[0])
print("Training Accuracy:", score_train[1])
#score_test was evaluated on X_test / y_test, so it is labelled as test (not validation) performance
print("Test Loss:", score_test[0])
print("Test Accuracy:", score_test[1])

**Accuracy Plot**

In [None]:
#Plot the accuracy over training and validation data with epoch
plt.plot(trained_model.history['acc'])
plt.plot(trained_model.history['val_acc'])

plt.title('Accuracy of Model')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
#'val_acc' is measured on the validation_split held out during fit, not on the test set
plt.legend(['train','validation'], loc='upper left')
plt.show()


**Loss Plot**

In [None]:
#Plot the loss over training and validation data with epoch
plt.plot(trained_model.history['loss'])
plt.plot(trained_model.history['val_loss'])

plt.title('Loss of Model')
plt.ylabel('Loss')
plt.xlabel('Epochs')
#'val_loss' is measured on the validation_split held out during fit, not on the test set
plt.legend(['train','validation'], loc='upper left')
plt.show()

In [None]:
#Predicting on the padded test sequences; the model ends in a single sigmoid unit
predictions = model.predict(X_test, batch_size=1024, verbose=1)

In [None]:
#Displaying the raw model predictions
predictions

In [None]:
#Rounding every prediction to the nearest whole number, one list per sample
prediction_labels = [list(np.around(row)) for row in predictions]
import itertools
#Lazily flattening the per-sample lists into a single stream of labels
flat = itertools.chain.from_iterable(prediction_labels)

In [None]:
#Materialising the chained iterator into a list (an itertools.chain can only be consumed once)
flat = list(flat)

In [None]:
#Displaying the flattened predicted labels
flat

In [None]:
#Accuracy of the rounded predictions against the test labels
#NOTE: accuracy_score's signature is (y_true, y_pred); the arguments are swapped here, which does not change the accuracy value
print(accuracy_score(flat,y_test))

In [None]:
# model = Sequential()
# embedding_layer = Embedding(vocab_size, 50, weights=[embedding_matrix], input_length=maxlen , trainable=False)
# model.add(embedding_layer)
# model.add(LSTM(128))
# model.add(SimpleRNN(units = 100, activation='relu',use_bias=True))
# #model.add(Dense(1, activation='sigmoid'))
# model.add(Dense(units=1000, input_dim = 2000, activation='sigmoid'))
# model.add(Dense(units=500, input_dim=1000, activation='relu'))
# model.add(Dense(units=2, input_dim=500,activation='softmax'))
# model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])