In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd

import keras
from keras import Sequential
from keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import pickle

In [3]:
df = pd.read_csv("/content/drive/MyDrive/research project/clean.csv")

In [4]:
df.head(10)

Unnamed: 0,id,label,clean
0,0,1,hous dem aid even see comey letter jason chaff...
1,1,0,flynn hillari clinton big woman campu breitbar...
2,2,1,truth might get fire truth might get fire octo...
3,3,1,civilian kill singl us airstrik identifi video...
4,4,1,iranian woman jail fiction unpublish stori wom...
5,5,0,jacki mason hollywood would love trump bomb no...
6,7,0,beno hamon win french socialist parti presiden...
7,9,0,back channel plan ukrain russia courtesi trump...
8,10,0,obama organ action partner soro link indivis d...
9,11,0,bbc comedi sketch real housew isi caus outrag ...


In [6]:
df= df.dropna()
df.isnull().sum()


id       0
label    0
clean    0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18271 entries, 0 to 18284
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      18271 non-null  int64 
 1   label   18271 non-null  int64 
 2   clean   18271 non-null  object
dtypes: int64(2), object(1)
memory usage: 571.0+ KB


##PREPARE THE DATA BY PERFORMING TOKENIZATION AND PADDING

In [None]:
# split data into test and train 
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.clean, df.label, test_size = 0.2, random_state =42)


In [None]:
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)


In [None]:
vocab_size = 20000
embedding_dim = 120

tokenizer = Tokenizer(num_words = vocab_size)
tokenizer.fit_on_texts(x_train)
train_sequences = tokenizer.texts_to_sequences(x_train) #applying the tokenizer
test_sequences = tokenizer.texts_to_sequences(x_test)

In [None]:
print("The encoding for document\n",df.clean[0],"\n is : ",train_sequences[0])

The encoding for document
 hous dem aid even see comey letter jason chaffetz tweet hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email

In [None]:
padded_train = pad_sequences(train_sequences,maxlen = 40, padding = 'post', truncating = 'post')

padded_test = pad_sequences(test_sequences,maxlen = 40, truncating = 'post', padding = 'post') 

## Building Model

In [None]:
#using bi directional lstm
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.3))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()

'''# using normal lstm
model = Sequential()
model.add(Embedding(vocab_size,embedding_dim))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dropout(0.3))
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())
'''

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 120)         2400000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               254976    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 2,688,001
Trainable params: 2,688,001
Non-trainable params: 0
____________________________________________

"# using normal lstm\nmodel = Sequential()\nmodel.add(Embedding(vocab_size,embedding_dim))\nmodel.add(Dropout(0.3))\nmodel.add(LSTM(100))\nmodel.add(Dropout(0.3))\nmodel.add(Dense(64,activation='relu'))\nmodel.add(Dropout(0.3))\nmodel.add(Dense(1,activation='sigmoid'))\nmodel.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])\nprint(model.summary())\n"

In [None]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [None]:
model.fit(padded_train, y_train, batch_size = 64, validation_data= (padded_test, y_test), epochs = 2) #bi - 2 epochs

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f36058eced0>

In [None]:
from keras.models import load_model

model.save('/content/drive/MyDrive/research project/model.h5')