In [7]:
import torch
import os
import argparse
from torch.utils.data import Dataset, DataLoader
import torchtext
from collections import Counter
import numpy as np
import pandas as pd
import pickle

In [8]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
sw = stopwords.words('english') 


class Tokenizer:

    def __init__(self, file, threshold=5):
        self.file = file
        self.data = pd.read_csv(file)
        self.threshold = threshold

    def preprocess(self):
        tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en')
        tokens = []
        sentence_list=[]
        for text in self.data['text'].tolist():
            tokens.append(tokenizer(text))
            sentence_list.append(text.split('.'))

        self.data['sentences_list'] = sentence_list
        counter = Counter()
        for line in tokens:
            for word in line:
                counter[word] += 1
        # print(len(counter.items()), len(counter.most_common()))

        # remove all words that have frequency less than threshold
        # counter_threshold = {k:v for k,v in counter.items() if v >= self.threshold}

        # create mappings
        # mapper = {word:idx+1 for idx,word in enumerate(counter_threshold.keys())}
        # inverse_mapper = {idx+1:word for idx,word in enumerate(counter_threshold.keys())}

        # sos_idx = len(counter_threshold.keys())
        # eos_idx = len(counter_threshold.keys()) + 1
        # other_idx = len(counter_threshold.keys()) + 2

        # mapped_tokens = []

        # for line in tokens:
        #     mapped_line = [sos_idx]
        #     for word in line:
        #       # map words to their mappings and to other otherwise
        #         mapped_line.append(mapper.get(word, other_idx))
        #     mapped_line.append(eos_idx)
        #     mapped_tokens.append(mapped_line)

        # inverse_mapper[other_idx] = "__OTHER__"
        # inverse_mapper[sos_idx] = "__SOS__"
        # inverse_mapper[eos_idx] = "__EOS__"
        # inverse_mapper[0] = "__PADDING__"

        mapper = {word[0]: idx+1 for idx,
                  word in enumerate(counter.most_common())}
        inverse_mapper = {idx+1: word[0] for idx,
                          word in enumerate(counter.most_common())}

        # sos_idx = len(counter_threshold.keys())
        # eos_idx = len(counter_threshold.keys()) + 1
        other_idx = len(counter.keys())

        mapped_tokens = []

        for line in tokens:
            mapped_line = []
            for word in line:
              # map words to their mappings and to other otherwise
                mapped_line.append(mapper.get(word, other_idx))
            mapped_tokens.append(mapped_line)

        # inverse_mapper[other_idx] = "__OTHER__"
        # inverse_mapper[sos_idx] = "__SOS__"
        # inverse_mapper[eos_idx] = "__EOS__"
        # inverse_mapper[0] = "__PADDING__"

        return mapped_tokens, inverse_mapper




def similarity_paragraph(data):
    # data = self.data
    sim_list = []
    for para in data['sentences_list'].tolist():
      sim = 2000
      start = para[0]
      para = para[1:]
      for sent in para:            
        # tokenization
        X_list = word_tokenize(start) 
        Y_list = word_tokenize(sent)
          
        # sw contains the list of stopwords
        l1 =[];l2 =[]
          
        # remove stop words from the string
        X_set = {w for w in X_list if not w in sw} 
        Y_set = {w for w in Y_list if not w in sw}
          
        # form a set containing keywords of both strings 
        rvector = X_set.union(Y_set) 
        for w in rvector:
            if w in X_set: l1.append(1) # create a vector
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0
          
        # cosine formula 
        for i in range(len(rvector)):
            c+= l1[i]*l2[i]
        try:
          cosine = c / float((sum(l1)*sum(l2))**0.5)
          if sim > cosine:
            sim=cosine
          # sim += cosine
        except:
          sim += 0
          
        start = sent
      
      # sim = sim/(len(para)+1)
      sim_list.append(sim)
    
    data['similarity'] = sim_list

    return data



          
          # print("similarity: ", cosine)


[nltk_data] Downloading package stopwords to /home/smruti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/smruti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
# training data
data1 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease/Clinton_train.csv')
data2 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease/Yahoo_train.csv')
data3 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease/Yelp_train.csv')
data4 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease//Enron_train.csv')

data5 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease/Yahoo_test.csv')
data6 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease/Yelp_test.csv')
data7 = pd.read_csv('data/GCDC_Corpus_v2/GCDC_rerelease/Enron_test.csv')


In [11]:
data = pd.concat([data1, data2, data3, data4, data5, data6, data7])

In [12]:
data.to_csv('data/GCDC_Corpus_v2/new_train.csv')

In [13]:
# train = Tokenizer("/content/gdrive/MyDrive/GCDC_rerelease/train.csv")
# test = Tokenizer("/content/gdrive/MyDrive/GCDC_rerelease/Yahoo_test.csv")

In [14]:
train = Tokenizer("data/GCDC_Corpus_v2/new_train.csv")
test = Tokenizer("data/GCDC_Corpus_v2/GCDC_rerelease/Clinton_test.csv")

In [15]:
len(train.data)
# len(test.data)

4600

In [16]:
# lst=[]
# for i in range(1000):
#   if train.data['labelA'][i]==3:
#     lst.append([0,0,1])
#   elif train.data['labelA'][i]==2:
#     lst.append([0,1,0])
#   elif train.data['labelA'][i]==1:
#     lst.append([1,0,0])

# train.data['h_e']=lst

from numpy import array
from numpy import argmax
from tensorflow.keras.utils import to_categorical

lst = array(train.data['labelA'])
encoded = to_categorical(lst)
print(encoded)
# inverted = argmax(encoded[0])
# print(inverted)

# train.data['h_e'] = encoded

[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]


In [17]:
# lst=[]
# for i in range(200):
#   if test.data['labelA'][i]==3:
#     lst.append([0,0,1])
#   elif test.data['labelA'][i]==2:
#     lst.append([0,1,0])
#   elif test.data['labelA'][i]==1:
#     lst.append([1,0,0])

# test.data['h_e']=lst

lst = array(test.data['labelA'])
t_encoded = to_categorical(lst)
# test.data['h_e'] = encoded

In [18]:
from torchtext import data



In [19]:
train_mapping, inv_train_mapping = train.preprocess()
test_mapping, inv_test_mapping = test.preprocess()

# train.data = similarity_paragraph(train.data)
# test.data = similarity_paragraph(test.data)



In [20]:
# new_data = open('/content/drive/MyDrive/GCDC_rerelease/mapped_tokens_Yelp_train.csv.pkl','rb')
# new_t_data = open('/content/drive/MyDrive/GCDC_rerelease/mapped_tokens_Yelp_test.csv.pkl','rb')
# train_mapping = pickle.load(new_data)
# test_mapping = pickle.load(new_t_data)

len(train_mapping)
# train_mapping
train.data['encoding'] = train_mapping
test.data['encoding'] = test_mapping
train.data

Unnamed: 0.1,Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,question_title,question,sentences_list,encoding
0,0,C05796441_2,,Cheryl:\n\nAre we in a good place to begin pap...,3,2,1,2,2,2,3,1,2,2,,,[Cheryl:\n\nAre we in a good place to begin pa...,"[527, 107, 15, 1437, 22, 10, 7, 75, 97, 4, 870..."
1,1,C05786430_1,Department of State,"Our friend, General Joe Ballard owns The Raven...",2,1,3,2,3,2,3,1,3,3,,,"[Our friend, General Joe Ballard owns The Rave...","[398, 370, 3, 1247, 2174, 9568, 3569, 29, 9569..."
2,2,C05780653_3,,Outstanding news! Miki Rakic called about 10 m...,2,3,3,3,2,2,3,2,3,3,,,[Outstanding news! Miki Rakic called about 10 ...,"[14045, 626, 37, 14046, 19230, 253, 54, 355, 3..."
3,3,C05782181_1,Libyan CG Pol Dirs mtg @ Istanbul @ 14:00 Thur...,Responding to separate emails from Uzra + Jeff...,1,2,1,1,2,2,1,3,1,1,,,[Responding to separate emails from Uzra + Jef...,"[19243, 4, 1518, 3177, 43, 14049, 1133, 581, 1..."
4,4,C05785147_0,Mexico,Guy from Mexico is in NY and is cooperating. D...,2,1,1,1,1,1,2,3,1,1,,,"[Guy from Mexico is in NY and is cooperating, ...","[6125, 43, 1011, 12, 10, 1566, 5, 12, 6700, 1,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,195,1353855,Comments of Wolak,"Wolak makes some good points. In ERCOT, Enron...",2,3,2,3,2,2,2,2,2,2,,,"[Wolak makes some good points, In ERCOT, Enr...","[36543, 433, 72, 75, 555, 1, 9, 166, 2845, 3, ..."
4596,196,1131834,NBC,The reason NBC will not take cash is the prefe...,2,2,1,1,2,2,2,2,2,2,,,[The reason NBC will not take cash is the pref...,"[29, 404, 4399, 28, 26, 134, 642, 12, 2, 4060,..."
4597,197,725369,Gallup Peak,All GC's\n\nAfter utilizing the Gallup Peak Av...,3,3,3,3,3,2,2,3,2,3,,,[All GC's\n\nAfter utilizing the Gallup Peak A...,"[360, 13660, 25, 15, 444, 18330, 2, 9777, 9551..."
4598,198,766379,New TW Contract System,"Lindy,\n\nJust wanted to let you know that we ...",3,2,1,2,2,3,2,2,2,2,,,"[Lindy,\n\nJust wanted to let you know that we...","[8276, 3, 15, 389, 202, 4, 179, 13, 68, 11, 22..."


In [21]:
train.data

Unnamed: 0.1,Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,question_title,question,sentences_list,encoding
0,0,C05796441_2,,Cheryl:\n\nAre we in a good place to begin pap...,3,2,1,2,2,2,3,1,2,2,,,[Cheryl:\n\nAre we in a good place to begin pa...,"[527, 107, 15, 1437, 22, 10, 7, 75, 97, 4, 870..."
1,1,C05786430_1,Department of State,"Our friend, General Joe Ballard owns The Raven...",2,1,3,2,3,2,3,1,3,3,,,"[Our friend, General Joe Ballard owns The Rave...","[398, 370, 3, 1247, 2174, 9568, 3569, 29, 9569..."
2,2,C05780653_3,,Outstanding news! Miki Rakic called about 10 m...,2,3,3,3,2,2,3,2,3,3,,,[Outstanding news! Miki Rakic called about 10 ...,"[14045, 626, 37, 14046, 19230, 253, 54, 355, 3..."
3,3,C05782181_1,Libyan CG Pol Dirs mtg @ Istanbul @ 14:00 Thur...,Responding to separate emails from Uzra + Jeff...,1,2,1,1,2,2,1,3,1,1,,,[Responding to separate emails from Uzra + Jef...,"[19243, 4, 1518, 3177, 43, 14049, 1133, 581, 1..."
4,4,C05785147_0,Mexico,Guy from Mexico is in NY and is cooperating. D...,2,1,1,1,1,1,2,3,1,1,,,"[Guy from Mexico is in NY and is cooperating, ...","[6125, 43, 1011, 12, 10, 1566, 5, 12, 6700, 1,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,195,1353855,Comments of Wolak,"Wolak makes some good points. In ERCOT, Enron...",2,3,2,3,2,2,2,2,2,2,,,"[Wolak makes some good points, In ERCOT, Enr...","[36543, 433, 72, 75, 555, 1, 9, 166, 2845, 3, ..."
4596,196,1131834,NBC,The reason NBC will not take cash is the prefe...,2,2,1,1,2,2,2,2,2,2,,,[The reason NBC will not take cash is the pref...,"[29, 404, 4399, 28, 26, 134, 642, 12, 2, 4060,..."
4597,197,725369,Gallup Peak,All GC's\n\nAfter utilizing the Gallup Peak Av...,3,3,3,3,3,2,2,3,2,3,,,[All GC's\n\nAfter utilizing the Gallup Peak A...,"[360, 13660, 25, 15, 444, 18330, 2, 9777, 9551..."
4598,198,766379,New TW Contract System,"Lindy,\n\nJust wanted to let you know that we ...",3,2,1,2,2,3,2,2,2,2,,,"[Lindy,\n\nJust wanted to let you know that we...","[8276, 3, 15, 389, 202, 4, 179, 13, 68, 11, 22..."


In [22]:
test.data

Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,sentences_list,encoding
0,C05760125_1,Hilda Solis Tom and Craig--,Madame Secretary:\n\nThank you for reaching ou...,3,3,3,3,2,3,2,2,2,2,[Madame Secretary:\n\nThank you for reaching o...,"[1046, 44, 37, 7, 320, 16, 14, 2848, 77, 4, 44..."
1,C05768263_2,,"Cheryl, Jake,\n\nI received a call from Masood...",3,3,3,3,2,2,2,3,2,2,"[Cheryl, Jake,\n\nI received a call from Masoo...","[199, 3, 183, 3, 7, 11, 560, 9, 123, 33, 2877,..."
2,C05771873_1,Framing Statement - State Draft,We anticipate the release of what are claimed ...,3,3,3,3,3,2,3,2,1,2,[We anticipate the release of what are claimed...,"[43, 2883, 1, 1061, 6, 78, 26, 1365, 4, 19, 40..."
3,C05768528_2,,Spoke to Ed Levine today to follow up on Frida...,3,3,3,3,3,2,2,1,3,2,[Spoke to Ed Levine today to follow up on Frid...,"[2904, 4, 495, 2905, 82, 4, 445, 63, 13, 490, ..."
4,C05775052_1,The matter I raised on my end of the converati...,Purely to update: Tom had me in for lunch at t...,2,1,3,2,2,2,3,1,2,2,[Purely to update: Tom had me in for lunch at ...,"[2923, 4, 498, 37, 396, 49, 48, 8, 14, 726, 29..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,C05782457_0,"arming the rebels, women, and small arms","Kavita Ramdas, until recently the head of the ...",3,3,2,3,1,1,2,2,2,1,"[Kavita Ramdas, until recently the head of the...","[6295, 6296, 3, 463, 790, 1, 614, 6, 1, 741, 1..."
196,C05739879_1,,I called PM el-Keib this morning to get his ta...,2,3,2,3,1,1,2,2,1,1,[I called PM el-Keib this morning to get his t...,"[11, 207, 318, 6320, 20, 6321, 17, 224, 4, 55,..."
197,C05765100_1,,Department of State Ranks High as Employer for...,2,3,2,3,2,3,3,2,3,3,[Department of State Ranks High as Employer fo...,"[127, 6, 71, 6342, 6343, 25, 6344, 14, 558, 63..."
198,C05773055_1,,Dear Hillary Wanted to take a minute to thank ...,2,3,2,3,2,2,2,2,3,2,[Dear Hillary Wanted to take a minute to thank...,"[187, 285, 6364, 4, 115, 9, 2823, 4, 324, 16, ..."


# GRU model 

In [23]:
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import SimpleRNN
from keras.layers import Embedding
from keras.preprocessing import sequence

In [24]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']





In [25]:
embedding_vector_length = 32
model_RNN = Sequential()
model_RNN.add(Embedding(40000,embedding_vector_length,input_length = 500))
model_RNN.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
model_RNN.add(SimpleRNN(32))
model_RNN.add(Dense(4,activation = 'softmax'))
model_RNN.compile(loss ='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model_RNN.summary())
model_RNN.fit(X_train,y_train, epochs = 20, batch_size=23)

scores = model_RNN.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))



None
Epoch 1/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 71ms/step - accuracy: 0.4351 - loss: 1.1449
Epoch 2/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.5855 - loss: 0.9259
Epoch 3/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 69ms/step - accuracy: 0.8943 - loss: 0.3608
Epoch 4/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.9818 - loss: 0.0863
Epoch 5/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 69ms/step - accuracy: 0.9922 - loss: 0.0369
Epoch 6/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.9933 - loss: 0.0270
Epoch 7/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.9949 - loss: 0.0165
Epoch 8/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 83ms/step - accuracy: 0.9952 - loss: 0.0165
Epoch 9/20
[1m200/

In [27]:
model_RNN.save("OUTPUT/RNN_model.h5")



In [20]:
# import pickle
# filename = 'model_1.sav'
# pickle.dump(model,open(filename,'wb'))



In [21]:
# type(X_train)

# X_train = np.append(train.data['similarity'][:,np.newaxis], X_train, axis=1)
# X_test = np.append(test.data['similarity'][:,np.newaxis],X_test, axis=1)

In [28]:
coh_bin = []
for i in range(4600):
  if train.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)
train.data['bin_coh']= coh_bin


coh_bin=[]
for i in range(200):
  if test.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)

test.data['bin_coh']=coh_bin

In [29]:
lst = array(train.data['bin_coh'])
encoded = to_categorical(lst)

lst = array(test.data['bin_coh'])
t_encoded = to_categorical(lst)



In [30]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

# X_train = np.append(train.data['similarity'][:,np.newaxis], X_train, axis=1)
# X_test = np.append(test.data['similarity'][:,np.newaxis],X_test, axis=1)



In [31]:
embedding_vector_length = 32
model_RNN_B = Sequential()
model_RNN_B.add(Embedding(40000,embedding_vector_length,input_length = 500))
model_RNN_B.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
model_RNN_B.add(SimpleRNN(32))
model_RNN_B.add(Dense(2,activation = 'softmax'))
model_RNN_B.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model_RNN_B.summary())
model_RNN_B.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model_RNN_B.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))


# embedding_vector_length = 32
# model_RNN = Sequential()
# model_RNN.add(Embedding(40000,embedding_vector_length,input_length = 500))
# model_RNN.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
# model_RNN.add(SimpleRNN(32))
# model_RNN.add(Dense(4,activation = 'softmax'))
# model_RNN.compile(loss ='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
# print(model_RNN.summary())
# model_RNN.fit(X_train,y_train, epochs = 20, batch_size=23)

# scores = model_RNN.evaluate(X_test, y_test, verbose =0)
# print("Accuracy: ",(scores[1]*100))



None
Epoch 1/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 80ms/step - accuracy: 0.6506 - loss: 0.6552
Epoch 2/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 83ms/step - accuracy: 0.7734 - loss: 0.4767
Epoch 3/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 84ms/step - accuracy: 0.9748 - loss: 0.1043
Epoch 4/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 81ms/step - accuracy: 0.9950 - loss: 0.0261
Epoch 5/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 80ms/step - accuracy: 0.9973 - loss: 0.0124
Epoch 6/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 88ms/step - accuracy: 0.9975 - loss: 0.0104
Epoch 7/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 88ms/step - accuracy: 0.9978 - loss: 0.0097
Epoch 8/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 99ms/step - accuracy: 0.9953 - loss: 0.0131
Epoch 9/15
[1m200/

In [32]:
model_RNN_B.save("OUTPUT/RNN_model_B.h5")



In [33]:
train.data = similarity_paragraph(train.data)
test.data = similarity_paragraph(test.data)

len(train_mapping)
# train_mapping
train.data['encoding'] = train_mapping
test.data['encoding'] = test_mapping

In [57]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

# X_train = X_train.reshape(4600,500,1)
# X_test = X_test.reshape(200,500,1)

# print(X_train.shape)
# print(train.data['similarity'].shape)

# X_train = np.append(train.data['similarity'], X_train, axis=0)
# # X_test = np.append(test.data['similarity'],X_test, axis=0)

In [58]:
embedding_vector_length = 32
model_RNN_C = Sequential()
model_RNN_C.add(Embedding(40000,embedding_vector_length,input_length = 501))
model_RNN_C.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
model_RNN_C.add(SimpleRNN(32))
model_RNN_C.add(Dense(2,activation = 'softmax'))
model_RNN_C.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model_RNN_C.summary())
model_RNN_C.fit(X_train,y_train, epochs = 15 , batch_size=23)



None
Epoch 1/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 73ms/step - accuracy: 0.6573 - loss: 0.6396
Epoch 2/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.7588 - loss: 0.4929
Epoch 3/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 72ms/step - accuracy: 0.9665 - loss: 0.1325
Epoch 4/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.9890 - loss: 0.0385
Epoch 5/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 82ms/step - accuracy: 0.9962 - loss: 0.0189
Epoch 6/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 85ms/step - accuracy: 0.9971 - loss: 0.0141
Epoch 7/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 100ms/step - accuracy: 0.9986 - loss: 0.0067
Epoch 8/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 95ms/step - accuracy: 0.9969 - loss: 0.0098
Epoch 9/15
[1m200

<keras.src.callbacks.history.History at 0x7f37dbd3d3d0>

In [59]:
scores = model_RNN_C.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Accuracy:  57.999998331069946


In [61]:
model_RNN_C.save("OUTPUT/RNN_model_C.h5")

