In [44]:
import torch
import os
import argparse
from torch.utils.data import Dataset, DataLoader
import torchtext
from collections import Counter
import numpy as np
import pandas as pd
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from numpy import array
from numpy import argmax
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import GRU
from keras.layers import SimpleRNN
from keras.layers import Embedding
from keras.preprocessing import sequence

import pickle

In [45]:
nltk.download('stopwords')
nltk.download('punkt')
sw = stopwords.words('english') 

[nltk_data] Downloading package stopwords to /home/smruti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/smruti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
class Tokenizer:
    def __init__(self, file, threshold=5):
        self.file = file
        self.data = pd.read_csv(file)
        self.threshold = threshold

    def preprocess(self):
        tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en')
        tokens = []
        sentence_list=[]
        for text in self.data['text'].tolist():
            tokens.append(tokenizer(text))
            sentence_list.append(text.split('.'))

        self.data['sentences_list'] = sentence_list
        counter = Counter()
        for line in tokens:
            for word in line:
                counter[word] += 1
        mapper = {word[0]: idx+1 for idx,
                  word in enumerate(counter.most_common())}
        inverse_mapper = {idx+1: word[0] for idx,
                          word in enumerate(counter.most_common())}
        other_idx = len(counter.keys())

        mapped_tokens = []

        for line in tokens:
            mapped_line = []
            for word in line:
              # map words to their mappings and to other otherwise
                mapped_line.append(mapper.get(word, other_idx))
            mapped_tokens.append(mapped_line)
        return mapped_tokens, inverse_mapper

In [47]:
def similarity_paragraph(data):
    # data = self.data
    sim_list = []
    for para in data['sentences_list'].tolist():
      sim = 2000
      start = para[0]
      para = para[1:]
      for sent in para:            
        # tokenization
        X_list = word_tokenize(start) 
        Y_list = word_tokenize(sent)
          
        # sw contains the list of stopwords
        l1 =[];l2 =[]
          
        # remove stop words from the string
        X_set = {w for w in X_list if not w in sw} 
        Y_set = {w for w in Y_list if not w in sw}
          
        # form a set containing keywords of both strings 
        rvector = X_set.union(Y_set) 
        for w in rvector:
            if w in X_set: l1.append(1)
            else: l1.append(0)
            if w in Y_set: l2.append(1)
            else: l2.append(0)
        c = 0
          
        # cosine formula 
        for i in range(len(rvector)):
            c+= l1[i]*l2[i]
        try:
          cosine = c / float((sum(l1)*sum(l2))**0.5)
          if sim > cosine:
            sim=cosine
        except:
          sim += 0
          
        start = sent
      sim_list.append(sim)
    
    data['similarity'] = sim_list
    return data

In [61]:
# training data
data_clin = pd.read_csv('Datasets/GCDC_Corpus_v2/Clinton_train.csv')
data_yah = pd.read_csv('Datasets/GCDC_Corpus_v2/Yahoo_train.csv')
data_yelp = pd.read_csv('Datasets/GCDC_Corpus_v2/Yelp_train.csv')
data_enr = pd.read_csv('Datasets/GCDC_Corpus_v2/Enron_train.csv')

data_yahTest = pd.read_csv('Datasets/GCDC_Corpus_v2/Yahoo_test.csv')
data_yelpTest = pd.read_csv('Datasets/GCDC_Corpus_v2/Yelp_test.csv')
data_enrTest = pd.read_csv('Datasets/GCDC_Corpus_v2/Enron_test.csv')

data_clin.head()

print(data_clin.shape, data_yah.shape, data_yelp.shape, data_enr.shape, data_yahTest.shape, data_yelpTest.shape, data_enrTest.shape)

(1000, 13) (1000, 14) (1000, 13) (1000, 13) (200, 14) (200, 13) (200, 13)


In [62]:
data = pd.concat([data_clin, data_yah, data_yelp, data_enr, data_yahTest, data_yelpTest, data_enrTest])
data.head(20)

Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,question_title,question
0,C05796441_2,,Cheryl:\n\nAre we in a good place to begin pap...,3,2,1,2,2,2,3,1,2,2,,
1,C05786430_1,Department of State,"Our friend, General Joe Ballard owns The Raven...",2,1,3,2,3,2,3,1,3,3,,
2,C05780653_3,,Outstanding news! Miki Rakic called about 10 m...,2,3,3,3,2,2,3,2,3,3,,
3,C05782181_1,Libyan CG Pol Dirs mtg @ Istanbul @ 14:00 Thur...,Responding to separate emails from Uzra + Jeff...,1,2,1,1,2,2,1,3,1,1,,
4,C05785147_0,Mexico,Guy from Mexico is in NY and is cooperating. D...,2,1,1,1,1,1,2,3,1,1,,
5,C05759844_1,,"For Secretary Clinton\n\nDear Hillary,\n\nYou ...",2,3,3,3,3,2,3,3,3,3,,
6,C05778292_0,Further Strong Words From Raj Shah,I'm sure you saw Raj's speech last week; I tho...,2,1,2,1,2,1,1,2,2,1,,
7,C05772086_1,Thank You,"Cheryl,\n\nI received word from the White Hous...",3,3,3,3,3,2,1,2,3,2,,
8,C05780274_2,Do you have any recs to replace you?,"Cheryl:\n\nAgain, I'm so glad you asked. The B...",1,2,3,2,2,3,1,2,3,2,,
9,C05767214_1,,Rich---Thanks for all you did to make the past...,2,1,2,1,1,2,1,2,1,1,,


In [63]:
data.to_csv('Preprocessing/GCDC_Corpus_v2/new_train.csv')

train = Tokenizer("Preprocessing/GCDC_Corpus_v2/new_train.csv")
test = Tokenizer("Datasets/GCDC_Corpus_v2/Clinton_test.csv")

len(train.data)
print(f'Train data: {len(train.data)}')
print(f'Test data: {len(test.data)}')

Train data: 4600
Test data: 200


In [64]:
lst = array(train.data['labelA'])
encoded = to_categorical(lst)
print(encoded)

[[0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]]


In [65]:
lst = array(test.data['labelA'])
t_encoded = to_categorical(lst)

In [66]:
from torchtext import data
import warnings
warnings.filterwarnings("ignore")

train_mapping, inv_train_mapping = train.preprocess()
test_mapping, inv_test_mapping = test.preprocess()

In [67]:
len(train_mapping)
print(f'Train mapping: {len(train_mapping)}')

# train_mapping
train.data['encoding'] = train_mapping
test.data['encoding'] = test_mapping
train.data

Train mapping: 4600


Unnamed: 0.1,Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,question_title,question,sentences_list,encoding
0,0,C05796441_2,,Cheryl:\n\nAre we in a good place to begin pap...,3,2,1,2,2,2,3,1,2,2,,,[Cheryl:\n\nAre we in a good place to begin pa...,"[527, 107, 15, 1437, 22, 10, 7, 75, 97, 4, 870..."
1,1,C05786430_1,Department of State,"Our friend, General Joe Ballard owns The Raven...",2,1,3,2,3,2,3,1,3,3,,,"[Our friend, General Joe Ballard owns The Rave...","[398, 370, 3, 1247, 2174, 9568, 3569, 29, 9569..."
2,2,C05780653_3,,Outstanding news! Miki Rakic called about 10 m...,2,3,3,3,2,2,3,2,3,3,,,[Outstanding news! Miki Rakic called about 10 ...,"[14045, 626, 37, 14046, 19230, 253, 54, 355, 3..."
3,3,C05782181_1,Libyan CG Pol Dirs mtg @ Istanbul @ 14:00 Thur...,Responding to separate emails from Uzra + Jeff...,1,2,1,1,2,2,1,3,1,1,,,[Responding to separate emails from Uzra + Jef...,"[19243, 4, 1518, 3177, 43, 14049, 1133, 581, 1..."
4,4,C05785147_0,Mexico,Guy from Mexico is in NY and is cooperating. D...,2,1,1,1,1,1,2,3,1,1,,,"[Guy from Mexico is in NY and is cooperating, ...","[6125, 43, 1011, 12, 10, 1566, 5, 12, 6700, 1,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,195,1353855,Comments of Wolak,"Wolak makes some good points. In ERCOT, Enron...",2,3,2,3,2,2,2,2,2,2,,,"[Wolak makes some good points, In ERCOT, Enr...","[36543, 433, 72, 75, 555, 1, 9, 166, 2845, 3, ..."
4596,196,1131834,NBC,The reason NBC will not take cash is the prefe...,2,2,1,1,2,2,2,2,2,2,,,[The reason NBC will not take cash is the pref...,"[29, 404, 4399, 28, 26, 134, 642, 12, 2, 4060,..."
4597,197,725369,Gallup Peak,All GC's\n\nAfter utilizing the Gallup Peak Av...,3,3,3,3,3,2,2,3,2,3,,,[All GC's\n\nAfter utilizing the Gallup Peak A...,"[360, 13660, 25, 15, 444, 18330, 2, 9777, 9551..."
4598,198,766379,New TW Contract System,"Lindy,\n\nJust wanted to let you know that we ...",3,2,1,2,2,3,2,2,2,2,,,"[Lindy,\n\nJust wanted to let you know that we...","[8276, 3, 15, 389, 202, 4, 179, 13, 68, 11, 22..."


In [68]:
train.data

Unnamed: 0.1,Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,question_title,question,sentences_list,encoding
0,0,C05796441_2,,Cheryl:\n\nAre we in a good place to begin pap...,3,2,1,2,2,2,3,1,2,2,,,[Cheryl:\n\nAre we in a good place to begin pa...,"[527, 107, 15, 1437, 22, 10, 7, 75, 97, 4, 870..."
1,1,C05786430_1,Department of State,"Our friend, General Joe Ballard owns The Raven...",2,1,3,2,3,2,3,1,3,3,,,"[Our friend, General Joe Ballard owns The Rave...","[398, 370, 3, 1247, 2174, 9568, 3569, 29, 9569..."
2,2,C05780653_3,,Outstanding news! Miki Rakic called about 10 m...,2,3,3,3,2,2,3,2,3,3,,,[Outstanding news! Miki Rakic called about 10 ...,"[14045, 626, 37, 14046, 19230, 253, 54, 355, 3..."
3,3,C05782181_1,Libyan CG Pol Dirs mtg @ Istanbul @ 14:00 Thur...,Responding to separate emails from Uzra + Jeff...,1,2,1,1,2,2,1,3,1,1,,,[Responding to separate emails from Uzra + Jef...,"[19243, 4, 1518, 3177, 43, 14049, 1133, 581, 1..."
4,4,C05785147_0,Mexico,Guy from Mexico is in NY and is cooperating. D...,2,1,1,1,1,1,2,3,1,1,,,"[Guy from Mexico is in NY and is cooperating, ...","[6125, 43, 1011, 12, 10, 1566, 5, 12, 6700, 1,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,195,1353855,Comments of Wolak,"Wolak makes some good points. In ERCOT, Enron...",2,3,2,3,2,2,2,2,2,2,,,"[Wolak makes some good points, In ERCOT, Enr...","[36543, 433, 72, 75, 555, 1, 9, 166, 2845, 3, ..."
4596,196,1131834,NBC,The reason NBC will not take cash is the prefe...,2,2,1,1,2,2,2,2,2,2,,,[The reason NBC will not take cash is the pref...,"[29, 404, 4399, 28, 26, 134, 642, 12, 2, 4060,..."
4597,197,725369,Gallup Peak,All GC's\n\nAfter utilizing the Gallup Peak Av...,3,3,3,3,3,2,2,3,2,3,,,[All GC's\n\nAfter utilizing the Gallup Peak A...,"[360, 13660, 25, 15, 444, 18330, 2, 9777, 9551..."
4598,198,766379,New TW Contract System,"Lindy,\n\nJust wanted to let you know that we ...",3,2,1,2,2,3,2,2,2,2,,,"[Lindy,\n\nJust wanted to let you know that we...","[8276, 3, 15, 389, 202, 4, 179, 13, 68, 11, 22..."


In [69]:
test.data

Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,sentences_list,encoding
0,C05760125_1,Hilda Solis Tom and Craig--,Madame Secretary:\n\nThank you for reaching ou...,3,3,3,3,2,3,2,2,2,2,[Madame Secretary:\n\nThank you for reaching o...,"[1046, 44, 37, 7, 320, 16, 14, 2848, 77, 4, 44..."
1,C05768263_2,,"Cheryl, Jake,\n\nI received a call from Masood...",3,3,3,3,2,2,2,3,2,2,"[Cheryl, Jake,\n\nI received a call from Masoo...","[199, 3, 183, 3, 7, 11, 560, 9, 123, 33, 2877,..."
2,C05771873_1,Framing Statement - State Draft,We anticipate the release of what are claimed ...,3,3,3,3,3,2,3,2,1,2,[We anticipate the release of what are claimed...,"[43, 2883, 1, 1061, 6, 78, 26, 1365, 4, 19, 40..."
3,C05768528_2,,Spoke to Ed Levine today to follow up on Frida...,3,3,3,3,3,2,2,1,3,2,[Spoke to Ed Levine today to follow up on Frid...,"[2904, 4, 495, 2905, 82, 4, 445, 63, 13, 490, ..."
4,C05775052_1,The matter I raised on my end of the converati...,Purely to update: Tom had me in for lunch at t...,2,1,3,2,2,2,3,1,2,2,[Purely to update: Tom had me in for lunch at ...,"[2923, 4, 498, 37, 396, 49, 48, 8, 14, 726, 29..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,C05782457_0,"arming the rebels, women, and small arms","Kavita Ramdas, until recently the head of the ...",3,3,2,3,1,1,2,2,2,1,"[Kavita Ramdas, until recently the head of the...","[6295, 6296, 3, 463, 790, 1, 614, 6, 1, 741, 1..."
196,C05739879_1,,I called PM el-Keib this morning to get his ta...,2,3,2,3,1,1,2,2,1,1,[I called PM el-Keib this morning to get his t...,"[11, 207, 318, 6320, 20, 6321, 17, 224, 4, 55,..."
197,C05765100_1,,Department of State Ranks High as Employer for...,2,3,2,3,2,3,3,2,3,3,[Department of State Ranks High as Employer fo...,"[127, 6, 71, 6342, 6343, 25, 6344, 14, 558, 63..."
198,C05773055_1,,Dear Hillary Wanted to take a minute to thank ...,2,3,2,3,2,2,2,2,3,2,[Dear Hillary Wanted to take a minute to thank...,"[187, 285, 6364, 4, 115, 9, 2823, 4, 324, 16, ..."


In [70]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

In [71]:
embedding_vector_length = 32
model_RNN = Sequential()
model_RNN.add(Embedding(40000,embedding_vector_length,input_length = 500))
model_RNN.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
model_RNN.add(SimpleRNN(32))
model_RNN.add(Dense(4,activation = 'softmax'))
model_RNN.compile(loss ='categorical_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model_RNN.summary())
model_RNN.fit(X_train,y_train, epochs = 20, batch_size=23)

scores = model_RNN.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

None
Epoch 1/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 80ms/step - accuracy: 0.4076 - loss: 1.1767
Epoch 2/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 75ms/step - accuracy: 0.5754 - loss: 0.9195
Epoch 3/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 93ms/step - accuracy: 0.9257 - loss: 0.2973
Epoch 4/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 80ms/step - accuracy: 0.9864 - loss: 0.0767
Epoch 5/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 89ms/step - accuracy: 0.9929 - loss: 0.0338
Epoch 6/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 81ms/step - accuracy: 0.9922 - loss: 0.0243
Epoch 7/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 87ms/step - accuracy: 0.9933 - loss: 0.0217
Epoch 8/20
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 87ms/step - accuracy: 0.9944 - loss: 0.0188
Epoch 9/20
[1m200/

In [72]:
model_RNN.save("Models/RNN_model_gcdc.h5")

# make file for the model
if not os.path.exists('Models'):
    os.makedirs('Models')
    
with open('Models/RNN_model_categorical.pkl', 'wb') as f:
    pickle.dump(model_RNN, f)
    
# filename = 'model_1.sav'
# pickle.dump(model,open(filename,'wb'))



In [73]:
coh_bin = []
for i in range(4600):
  if train.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)
train.data['bin_coh']= coh_bin


coh_bin=[]
for i in range(200):
  if test.data['labelA'].tolist()[i] >=2:
    coh_bin.append(1)
  else:
    coh_bin.append(0)

test.data['bin_coh']=coh_bin

lst = array(train.data['bin_coh'])
encoded = to_categorical(lst)

lst = array(test.data['bin_coh'])
t_encoded = to_categorical(lst)

In [74]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

In [75]:
embedding_vector_length = 32
model_RNN_B = Sequential()
model_RNN_B.add(Embedding(40000,embedding_vector_length,input_length = 500))
model_RNN_B.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
model_RNN_B.add(SimpleRNN(32))
model_RNN_B.add(Dense(2,activation = 'softmax'))
model_RNN_B.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model_RNN_B.summary())
model_RNN_B.fit(X_train,y_train, epochs = 15 , batch_size=23)

scores = model_RNN_B.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

None
Epoch 1/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 82ms/step - accuracy: 0.6590 - loss: 0.6455
Epoch 2/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 78ms/step - accuracy: 0.7584 - loss: 0.5000
Epoch 3/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 75ms/step - accuracy: 0.9643 - loss: 0.1313
Epoch 4/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 75ms/step - accuracy: 0.9962 - loss: 0.0274
Epoch 5/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 75ms/step - accuracy: 0.9986 - loss: 0.0123
Epoch 6/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9920 - loss: 0.0257
Epoch 7/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 77ms/step - accuracy: 0.9971 - loss: 0.0122
Epoch 8/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 77ms/step - accuracy: 0.9937 - loss: 0.0168
Epoch 9/15
[1m200/

In [82]:
model_RNN_B.save("Models/RNN_model_BinaryCrossEntroppy.h5")

with open('Models/RNN_model_BinaryCrossEntroppy.pkl', 'wb') as f:
    pickle.dump(model_RNN_B, f)



In [77]:
train.data = similarity_paragraph(train.data)
test.data = similarity_paragraph(test.data)

len(train_mapping)
# train_mapping
train.data['encoding'] = train_mapping
test.data['encoding'] = test_mapping

In [78]:
np.random.seed(7)
X_train = sequence.pad_sequences(train.data['encoding'],maxlen = 500)
y_train = encoded#train.data['h_e']
X_test = sequence.pad_sequences(test.data['encoding'],maxlen=500)
y_test = t_encoded#test.data['h_e']

print(X_train.shape)
print(train.data['similarity'].shape)
train.data.head()




(4600, 500)
(4600,)


Unnamed: 0.1,Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM,question_title,question,sentences_list,encoding,bin_coh,similarity
0,0,C05796441_2,,Cheryl:\n\nAre we in a good place to begin pap...,3,2,1,2,2,2,3,1,2,2,,,[Cheryl:\n\nAre we in a good place to begin pa...,"[527, 107, 15, 1437, 22, 10, 7, 75, 97, 4, 870...",1,0.0
1,1,C05786430_1,Department of State,"Our friend, General Joe Ballard owns The Raven...",2,1,3,2,3,2,3,1,3,3,,,"[Our friend, General Joe Ballard owns The Rave...","[398, 370, 3, 1247, 2174, 9568, 3569, 29, 9569...",1,0.0
2,2,C05780653_3,,Outstanding news! Miki Rakic called about 10 m...,2,3,3,3,2,2,3,2,3,3,,,[Outstanding news! Miki Rakic called about 10 ...,"[14045, 626, 37, 14046, 19230, 253, 54, 355, 3...",1,0.0
3,3,C05782181_1,Libyan CG Pol Dirs mtg @ Istanbul @ 14:00 Thur...,Responding to separate emails from Uzra + Jeff...,1,2,1,1,2,2,1,3,1,1,,,[Responding to separate emails from Uzra + Jef...,"[19243, 4, 1518, 3177, 43, 14049, 1133, 581, 1...",0,0.0
4,4,C05785147_0,Mexico,Guy from Mexico is in NY and is cooperating. D...,2,1,1,1,1,1,2,3,1,1,,,"[Guy from Mexico is in NY and is cooperating, ...","[6125, 43, 1011, 12, 10, 1566, 5, 12, 6700, 1,...",0,0.0


In [79]:
embedding_vector_length = 32
model_RNN_C = Sequential()
model_RNN_C.add(Embedding(40000,embedding_vector_length,input_length = 501))
model_RNN_C.add(SimpleRNN(32,dropout=0.2, return_sequences = True ))
model_RNN_C.add(SimpleRNN(32))
model_RNN_C.add(Dense(2,activation = 'softmax'))
model_RNN_C.compile(loss ='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model_RNN_C.summary())
model_RNN_C.fit(X_train,y_train, epochs = 15 , batch_size=23)

None
Epoch 1/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 74ms/step - accuracy: 0.6528 - loss: 0.6520
Epoch 2/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.7933 - loss: 0.4686
Epoch 3/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9683 - loss: 0.1263
Epoch 4/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9930 - loss: 0.0347
Epoch 5/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9975 - loss: 0.0172
Epoch 6/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.9965 - loss: 0.0185
Epoch 7/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 73ms/step - accuracy: 0.9923 - loss: 0.0202
Epoch 8/15
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 74ms/step - accuracy: 0.9968 - loss: 0.0105
Epoch 9/15
[1m200/

<keras.src.callbacks.history.History at 0x7fc829749e90>

In [80]:
scores = model_RNN_C.evaluate(X_test, y_test, verbose =0)
print("Accuracy: ",(scores[1]*100))

Accuracy:  60.50000190734863


In [85]:
model_RNN_C.save("Models/RNN_model_C.h5")

