In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive used below are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import joblib
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, average_precision_score, recall_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.layers import Dense, SimpleRNN, LSTM, Dropout, Bidirectional, TimeDistributed
from keras.layers.embeddings import Embedding

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def datasets(path):
  """Read a CSV and return its text, title and label columns as arrays.

  Args:
    path: Anything `pd.read_csv` accepts (file path or file-like object).

  Returns:
    A 3-tuple `(X_text, X_title, y)` with the `.values` of the 'text',
    'title' and 'label' columns, in that order.

  Raises:
    KeyError: if one of the three columns is missing from the CSV.
  """
  frame = pd.read_csv(path)
  text_values, title_values, label_values = (
    frame[column].values for column in ('text', 'title', 'label')
  )
  return text_values, title_values, label_values

In [None]:
# Load the text, title and label arrays for Dataset1.
X_text1, X_title1, y1 = datasets('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset1.csv')

In [None]:
# Load the arrays used by the "Dataset2" experiments further down (only y2 is used there).
# NOTE(review): this reads Dataset3.csv while the cached corpora loaded later are named
# Dataset2_*_corpus -- confirm that y2 is row-aligned with those corpora.
X_text2, X_title2, y2 = datasets('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset3.csv')

In [None]:
# NOTE(review): this path has no .csv extension, and X_text3 / X_title3 / y3 are not
# referenced anywhere else in the visible notebook -- confirm the path and whether this
# cell is still needed.
X_text3, X_title3, y3 = datasets('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset_4')

In [None]:
# Parse the raw GloVe text file into a {token: float32 vector} dictionary.
# Each well-formed line is "<token> v1 v2 ... v300". This is slow on the full file; the
# next cell loads a joblib-cached dictionary into the same variable instead.
embeddings_dict = {}
with open("/content/drive/MyDrive/MLProject/glove.840B.300d.txt", 'r', encoding='utf-8') as f:
  for line in f:
    # Split from the right so tokens that themselves contain spaces stay intact
    # instead of spilling into the numeric fields.
    fields = line.rstrip().rsplit(' ', 300)
    if len(fields) != 301:
      # Blank or truncated line: there is no complete 300-d vector to store.
      continue
    word = fields[0]
    try:
      embeddings_dict[word] = np.asarray(fields[1:], dtype='float32')
    except ValueError:
      # A non-numeric value among the 300 fields: skip this malformed entry.
      continue

In [None]:
# Load the GloVe dictionary cached with joblib. This rebinds `embeddings_dict`, replacing
# whatever the text-parsing cell above produced.
# joblib files are pickle-based: only load caches from a trusted source.
embeddings_dict = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Glove_Embedding')

In [None]:
def splitting(corpus, y):
  """Split `corpus` and `y` into train and test parts, holding out 30% for testing.

  The split uses a fixed `random_state` of 42, so repeated calls on the same
  inputs give the same partition.

  Returns:
    `(X_train, X_test, y_train, y_test)`.
  """
  train_docs, test_docs, train_labels, test_labels = train_test_split(
    corpus,
    y,
    test_size=0.3,
    random_state=42,
  )
  return train_docs, test_docs, train_labels, test_labels

In [None]:
def preprocessing(X_train, X_test, embeddings_dict, max_len=150, embedding_dim=300):
  """Tokenize and pad both splits and build the pretrained embedding matrix.

  The tokenizer is fitted on `X_train` only; the same fitted tokenizer then
  encodes both splits.

  Args:
    X_train: Training documents (iterable of strings).
    X_test: Test documents (iterable of strings).
    embeddings_dict: Mapping from word to its pretrained vector.
    max_len: Length every encoded sequence is padded/truncated to (padding is
      appended at the end). Defaults to 150.
    embedding_dim: Width of the vectors in `embeddings_dict`. Defaults to 300.

  Returns:
    `(embeddings_matrix, X_train_padded, X_test_padded, vocab_size)` where
    `embeddings_matrix` has shape `(vocab_size, embedding_dim)` and rows for
    words missing from `embeddings_dict` are left as zeros.
  """
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(X_train)
  # One extra row beyond the word indices (row 0 is never assigned below).
  vocab_size = len(tokenizer.word_index) + 1

  X_train_padded = pad_sequences(
    tokenizer.texts_to_sequences(X_train), maxlen=max_len, padding='post')
  X_test_padded = pad_sequences(
    tokenizer.texts_to_sequences(X_test), maxlen=max_len, padding='post')

  embeddings_matrix = np.zeros((vocab_size, embedding_dim))
  for word, index in tokenizer.word_index.items():
    vector = embeddings_dict.get(word)
    if vector is not None:
      embeddings_matrix[index] = vector
  return embeddings_matrix, X_train_padded, X_test_padded, vocab_size

In [None]:
def BiLSTM_model(embeddings_matrix, X_train_padded, y_train, X_test_padded, y_test, vocab_size,
                 max_len=150, embedding_dim=300, lstm_units=600, epochs=15):
  """Build, compile and train a bidirectional LSTM on frozen pretrained embeddings.

  Args:
    embeddings_matrix: Pretrained weights for the embedding layer, expected as
      `(vocab_size, embedding_dim)`.
    X_train_padded, y_train: Training sequences and labels.
    X_test_padded, y_test: Held-out sequences and labels; passed to `fit` as
      `validation_data`, so they are only used for per-epoch monitoring.
    vocab_size: Number of rows of the embedding layer.
    max_len: Sequence length declared to the embedding layer. Defaults to 150.
    embedding_dim: Width of the embedding vectors. Defaults to 300.
    lstm_units: Units per LSTM direction. Defaults to 600.
    epochs: Number of training epochs. Defaults to 15.

  Returns:
    The trained Keras `Sequential` model.

  NOTE(review): the LSTM uses `return_sequences=True` and the sigmoid layer is
  wrapped in `TimeDistributed`, so the network emits one output per timestep
  rather than one per document. Confirm this is intended for the labels passed in.
  """
  model = Sequential()
  # trainable=False keeps the pretrained vectors fixed during training.
  model.add(Embedding(vocab_size, embedding_dim, weights=[embeddings_matrix],
                      input_length=max_len, trainable=False))
  model.add(Bidirectional(LSTM(lstm_units, return_sequences=True)))
  model.add(TimeDistributed(Dense(1, activation='sigmoid')))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
  model.fit(X_train_padded, y_train, epochs=epochs, validation_data=(X_test_padded, y_test))
  return model

In [None]:
def plotting_accuracies(test_accuracy_text,test_accuracy_title,test_accuracy_tt,dataset_name='Dataset1'):
  """Show a bar chart comparing accuracy for the three input variations.

  Args:
    test_accuracy_text: Accuracy of the model trained on article text.
    test_accuracy_title: Accuracy of the model trained on titles.
    test_accuracy_tt: Accuracy of the model trained on text and title combined.
    dataset_name: Name shown in the chart title. Defaults to 'Dataset1', which
      reproduces the previously hard-coded title.

  Returns:
    None. The figure is displayed via `fig.show()`.
  """
  import plotly.express as px

  accuracy_frame = pd.DataFrame({
    'Accuracy': [test_accuracy_text, test_accuracy_title, test_accuracy_tt],
    'Variation': ['Text', 'Title', 'Text+Title'],
  })
  fig = px.bar(accuracy_frame, y='Accuracy', x='Variation', text='Accuracy', color='Variation',
               title=f'Accuracy Analysis on {dataset_name}')
  # Print each accuracy value above its bar.
  fig.update_traces(texttemplate='%{text}', textposition='outside')
  fig.show()

# Dataset1

In [None]:
# Load the cached Dataset1 text and title corpora.
# NOTE(review): presumably these are preprocessed documents aligned row-for-row with y1 -- confirm.
corpus1_text = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset1_Text_corpus')
corpus1_title = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset1_Title_corpus')

In [None]:
# Train/test split of the Dataset1 text corpus against its labels.
X_train, X_test, y_train, y_test = splitting(corpus1_text, y1)

In [None]:
# Encode/pad the text split and build its embedding matrix, then train the BiLSTM on it.
embeddings_matrix, X_train_padded, X_test_padded, vocab_size = preprocessing(X_train, X_test, embeddings_dict)
model = BiLSTM_model(embeddings_matrix, X_train_padded, y_train, X_test_padded, y_test, vocab_size)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Evaluate the Dataset1 text model on the held-out split (loss plus the compiled accuracy metric).
test_loss, test_accuracy = model.evaluate(X_test_padded,y_test)
print('Accuracy: ',test_accuracy)

Accuracy:  0.9281935691833496


In [None]:
# The dump below is commented out, so the value loaded here is whatever an earlier run
# stored on Drive -- not necessarily the `test_accuracy` computed in the cell above.
# joblib.dump(test_accuracy, '/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D1')
test_accuracy_text1 = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D1')

In [None]:
# Same split -> preprocess -> train pipeline as for the text corpus, applied to the Dataset1 titles.
X_train_title, X_test_title, y_train_title, y_test_title = splitting(corpus1_title, y1)
embeddings_matrix_title, X_train_title_padded, X_test_title_padded, vocab_size_title = preprocessing(X_train_title, X_test_title, embeddings_dict)
model_title = BiLSTM_model(embeddings_matrix_title, X_train_title_padded, y_train_title, X_test_title_padded, y_test_title, vocab_size_title)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Evaluate the Dataset1 title model; this rebinds test_loss / test_accuracy from the text run.
test_loss, test_accuracy = model_title.evaluate(X_test_title_padded, y_test_title)
print('Accuracy: ',test_accuracy)

Accuracy:  0.9177936315536499


In [None]:
# Dump is commented out: the accuracy loaded here comes from a previously stored file,
# not necessarily from the evaluation cell above.
# joblib.dump(test_accuracy, '/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D1_title')
test_accuracy_title1 = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D1_title')

In [None]:
def concat_text_title(corpus_text, corpus_title):
  """Join each document of `corpus_text` with the matching entry of `corpus_title`.

  Documents are paired positionally (extra items in the longer input are
  ignored) and joined as "<corpus_text item> <corpus_title item>" with a
  single space.

  Args:
    corpus_text: Iterable of strings used as the first half of each result.
    corpus_title: Iterable of strings used as the second half of each result.

  Returns:
    A NumPy array with one combined string per pair; an empty array when
    there are no pairs.
  """
  pairs = list(zip(corpus_text, corpus_title))
  if not pairs:
    # An empty frame has no columns to rename, so short-circuit instead of failing.
    return np.array([], dtype=object)

  frame = pd.DataFrame(pairs, columns=['text', 'title'])
  combined = frame['text'].str.cat(frame['title'], sep=" ")
  return combined.to_numpy()

In [None]:
# NOTE(review): concat_text_title is declared as (corpus_text, corpus_title) but is called
# here with the title corpus first, so each combined string is "<title> <text>".
# Confirm this order is intended.
text_title = concat_text_title(corpus1_title, corpus1_text)

In [None]:
# Train/test split of the combined Dataset1 strings.
X_train_tt, X_test_tt, y_train_tt, y_test_tt = splitting(text_title, y1)

In [None]:
# Encode/pad the combined split and build its embedding matrix.
embeddings_matrix_tt, X_train_tt_padded, X_test_tt_padded, vocab_size_tt = preprocessing(X_train_tt, X_test_tt, embeddings_dict)

In [None]:
# Train the BiLSTM on the combined Dataset1 inputs.
model_tt = BiLSTM_model(embeddings_matrix_tt, X_train_tt_padded, y_train_tt, X_test_tt_padded, y_test_tt, vocab_size_tt)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Evaluate the combined-input Dataset1 model; rebinds test_loss / test_accuracy again.
test_loss, test_accuracy = model_tt.evaluate(X_test_tt_padded, y_test_tt)
print('Accuracy: ',test_accuracy)

Accuracy:  0.9387797713279724


In [None]:
# Dump is commented out: the accuracy loaded here comes from a previously stored file,
# not necessarily from the evaluation cell above.
# joblib.dump(test_accuracy, '/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D1_tt')
test_accuracy_tt1 = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D1_tt')

In [None]:
# Bar chart of the three Dataset1 accuracies (text, title, combined).
plotting_accuracies(test_accuracy_text1,test_accuracy_title1,test_accuracy_tt1)

# Dataset2

In [None]:
# Load the cached Dataset2 text and title corpora.
# NOTE(review): these are paired with y2 below, which was read from Dataset3.csv -- confirm
# the corpora and labels refer to the same rows.
corpus2_text = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset2_Text_corpus')
corpus2_title = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Dataset2_Title_corpus')

In [None]:
# Split -> preprocess -> train on the Dataset2 text corpus.
X_train2, X_test2, y_train2, y_test2 = splitting(corpus2_text, y2)
embeddings_matrix2, X_train2_padded, X_test2_padded, vocab_size2 = preprocessing(X_train2, X_test2, embeddings_dict)
model2 = BiLSTM_model(embeddings_matrix2, X_train2_padded, y_train2, X_test2_padded, y_test2, vocab_size2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Evaluate the Dataset2 text model on its held-out split.
test_loss, test_accuracy = model2.evaluate(X_test2_padded,y_test2)
print('Accuracy: ',test_accuracy)

Accuracy:  0.9885739088058472


In [None]:
# Dump is commented out: the accuracy loaded here comes from a previously stored file,
# not necessarily from the evaluation cell above.
# joblib.dump(test_accuracy, '/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D2')
test_accuracy_text2 = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D2')

In [None]:
# Split -> preprocess -> train on the Dataset2 title corpus.
X_train2_title, X_test2_title, y_train2_title, y_test2_title = splitting(corpus2_title, y2)
embeddings_matrix2_title, X_train2_title_padded, X_test2_title_padded, vocab_size2_title = preprocessing(X_train2_title, X_test2_title, embeddings_dict)
model2_title = BiLSTM_model(embeddings_matrix2_title, X_train2_title_padded, y_train2_title, X_test2_title_padded, y_test2_title, vocab_size2_title)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [None]:
# Evaluate the Dataset2 title model on its held-out split.
# NOTE(review): the recorded accuracy for this run (~0.52) is far below the text run on the
# same labels -- check that the Dataset2 title corpus is row-aligned with y2.
test_loss, test_accuracy = model2_title.evaluate(X_test2_title_padded, y_test2_title)
print('Accuracy: ',test_accuracy)

Accuracy:  0.5221098065376282


In [None]:
# Dump is commented out: the accuracy loaded here comes from a previously stored file,
# not necessarily from the evaluation cell above.
# joblib.dump(test_accuracy, '/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D2_title')
test_accuracy_title2 = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D2_title')

In [None]:
# NOTE(review): as with Dataset1, the title corpus is passed in the `corpus_text` position,
# so each combined string is "<title> <text>" -- confirm this order is intended.
text_title2 = concat_text_title(corpus2_title, corpus2_text)

In [None]:
# Split -> preprocess -> train on the combined Dataset2 strings.
X_train_tt2, X_test_tt2, y_train_tt2, y_test_tt2 = splitting(text_title2, y2)
embeddings_matrix_tt2, X_train_tt2_padded, X_test_tt2_padded, vocab_size_tt2 = preprocessing(X_train_tt2, X_test_tt2, embeddings_dict)
model_tt2 = BiLSTM_model(embeddings_matrix_tt2, X_train_tt2_padded, y_train_tt2, X_test_tt2_padded, y_test_tt2, vocab_size_tt2)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

In [None]:
# Evaluate the combined-input Dataset2 model on its held-out split.
test_loss, test_accuracy = model_tt2.evaluate(X_test_tt2_padded, y_test_tt2)
print('Accuracy: ',test_accuracy)

In [None]:
# Unlike the other accuracy cells, the dump here is active: each run overwrites the stored
# value with the current `test_accuracy` before reading it back.
joblib.dump(test_accuracy, '/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D2_tt')
test_accuracy_tt2 = joblib.load('/content/drive/MyDrive/Group5_MLProject_EndSem/Accuracy_BLSTM_D2_tt')

In [None]:
# Bar chart of the three Dataset2 accuracies.
# NOTE(review): no dataset name is passed, so the chart title produced by
# plotting_accuracies reads 'Accuracy Analysis on Dataset1' for this Dataset2 plot.
plotting_accuracies(test_accuracy_text2,test_accuracy_title2,test_accuracy_tt2)

# Dataset Accuracy Comparison and Analysis

In [None]:
# Grouped bar chart comparing the three input variations across both datasets.
import plotly.graph_objects as go

variation_labels = ['Text', 'Title', 'Text+Title']
accuracies_by_dataset = {
    'Dataset1': [test_accuracy_text1, test_accuracy_title1, test_accuracy_tt1],
    'Dataset2': [test_accuracy_text2, test_accuracy_title2, test_accuracy_tt2],
}
# One trace per dataset; barmode='group' below places them side by side per variation.
fig = go.Figure(data=[
    go.Bar(name=dataset_name, x=variation_labels, y=accuracies, text=accuracies, opacity=0.98)
    for dataset_name, accuracies in accuracies_by_dataset.items()
])
# Print each accuracy value above its bar.
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(title_text='Accuracy Comparison on Datasets', barmode='group')
fig.show()