In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer , WordNetLemmatizer
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense , LSTM , Dropout , Embedding , Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint , EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix , accuracy_score

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [3]:
!gdown 1j_X5sOSZ1taeJ7KLY-kYbHAVv3Uyt0Z2

Downloading...
From: https://drive.google.com/uc?id=1j_X5sOSZ1taeJ7KLY-kYbHAVv3Uyt0Z2
To: /content/train.csv.zip
100% 38.8M/38.8M [00:00<00:00, 73.6MB/s]


In [4]:
!unzip -q /content/train.csv.zip

In [32]:
df = pd.read_csv('/content/train.csv')
df.head(3)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1


In [50]:
df['title'][201]
df['label'][201]

1

In [6]:
df = df.dropna()

In [7]:
df = df.reset_index()

In [None]:
# df['title'] = df['title'].apply(lambda x : str(x))

In [8]:
lemma = WordNetLemmatizer()

In [9]:
corpus = []
for i in range(len(df['title'])):
  text = re.sub('[^A-Za-z]',' ',df['title'][i])
  text = text.lower()
  text = text.split()
  text = [lemma.lemmatize(word) for word in text if word not in stopwords.words('english')]
  texts = ' '.join(text)
  corpus.append(texts)

In [12]:
corpus[0]

'house dem aide even see comey letter jason chaffetz tweeted'

In [10]:
tk = Tokenizer()

In [11]:
tk.fit_on_texts(corpus)

In [13]:
total_words = len(tk.word_index) + 1

In [14]:
print(total_words)

17491


In [None]:
# tk.word_index.items()

In [15]:
text_to_num = []
for text in corpus:
  texts = tk.texts_to_sequences([text])[0]
  text_to_num.append(texts)

In [16]:
text_to_num[0]

[29, 526, 558, 249, 136, 81, 449, 1734, 3154, 6801]

In [17]:
max_len = max([len(text) for text in text_to_num])

In [18]:
print(f'max len sentence : {max_len}')

max len sentence : 47


In [19]:
padded_seq = pad_sequences(text_to_num,maxlen=47)

In [20]:
padded_seq[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,   29,  526,  558,  249,  136,   81,  449,
       1734, 3154, 6801], dtype=int32)

In [21]:
y = df['label']

In [22]:
X = padded_seq

In [23]:
del padded_seq

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [25]:
model = Sequential()
model.add(Embedding(total_words,100,input_length=max_len))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100,return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
adam = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy',metrics=['Accuracy'],optimizer=adam)
check = ModelCheckpoint('fake_news_classifier.h5',mode='min',save_best_only=True)

In [None]:
model.fit(X_train, y_train, epochs=50, verbose=1,batch_size= 64 , callbacks=[check],validation_data=(X_test,y_test))

In [27]:
y_pred = model.predict(X_test)

In [28]:
print(confusion_matrix(y_test,y_pred.round()))

[[1919  163]
 [ 165 1410]]


In [29]:
print(f'accuracy score : {accuracy_score(y_test,y_pred.round())}')

accuracy score : 0.9103089964451736


In [51]:
predict = 'Doctors Mysteriously Found Dead After Summit For Breakthrough Cure For Cancer'

In [52]:
sequence = tk.texts_to_sequences([predict])[0]
# print(sequence)
padd_predict_text = pad_sequences([sequence] , maxlen= 47 )
pred = model.predict(padd_predict_text)
print(f'Prediction : {pred[0][0]:.2f}')

Prediction : 1.00
