In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score

In [3]:
train_data = pd.read_csv("fake-news/train.csv")

In [4]:
train_data.isna().sum() 

id           0
title      558
author    1957
text        39
label        0
dtype: int64

### Processing data

In [5]:
train_data = train_data.dropna()

In [6]:
y = train_data[['label']].reset_index(drop=True)
x = train_data.drop('label', axis=1).reset_index(drop=True)

In [7]:
messages =x.copy()

In [8]:
messages.reset_index(inplace=True)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

In [11]:
ps = PorterStemmer()
corpus = []
for sent in x['title']:
    sent = re.sub(r"[^a-zA-Z]"," ",sent)
    sent = word_tokenize(sent)
    sent = [ps.stem(word.lower()) for word in sent if not word in set(stopwords.words('english'))]
    sent = " ".join(sent)
    corpus.append(sent)

### Coverting data to vector form

In [22]:
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
corpus_cv = cv.fit_transform(corpus).toarray()
print(cv.get_feature_names()[:20])

['abandon', 'abc', 'abduct', 'abe', 'abedin', 'abl', 'abort', 'about', 'about elect', 'about elect violenc', 'about emf', 'about emf damag', 'about hillari', 'about it', 'about the', 'about to', 'about trump', 'abov', 'abroad', 'absolut']


In [23]:
tfidf = TfidfVectorizer(max_features=5000,ngram_range=(1,3))
corpus_tfidf = tfidf.fit_transform(corpus).toarray()

### Splitting the data

In [30]:
from sklearn.model_selection import train_test_split

x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(corpus_cv, y, test_size=0.2)
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(corpus_tfidf, y, test_size=0.2)

### Model

In [25]:
from sklearn.naive_bayes import MultinomialNB

modelNB = MultinomialNB()
modelNB.fit(x_train_cv, y_train_cv)

y_pred = modelNB.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))

  return f(*args, **kwargs)


0.8877364246491762
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      2080
           1       0.86      0.92      0.89      1577

    accuracy                           0.90      3657
   macro avg       0.90      0.90      0.90      3657
weighted avg       0.90      0.90      0.90      3657



In [26]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=1000)
linear_clf.fit(x_train_cv, y_train_cv)

y_pred = linear_clf.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))

  return f(*args, **kwargs)


0.9185278389430639
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      2080
           1       0.91      0.93      0.92      1577

    accuracy                           0.93      3657
   macro avg       0.93      0.93      0.93      3657
weighted avg       0.93      0.93      0.93      3657



In [27]:
from sklearn.linear_model import RidgeClassifier
modelR = RidgeClassifier()
modelR.fit(x_train_cv, y_train_cv)

y_pred = modelR.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))


  return f(*args, **kwargs)


0.9283338373147868
              precision    recall  f1-score   support

           0       0.98      0.91      0.94      2080
           1       0.89      0.97      0.93      1577

    accuracy                           0.94      3657
   macro avg       0.93      0.94      0.93      3657
weighted avg       0.94      0.94      0.94      3657



In [28]:
from sklearn.linear_model import SGDClassifier
modelSGD = SGDClassifier()
modelSGD.fit(x_train_cv, y_train_cv)

y_pred = modelSGD.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))

  return f(*args, **kwargs)


0.9296560272699101
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      2080
           1       0.91      0.95      0.93      1577

    accuracy                           0.94      3657
   macro avg       0.94      0.94      0.94      3657
weighted avg       0.94      0.94      0.94      3657



In [31]:
from sklearn.linear_model import SGDClassifier
modelSGD = SGDClassifier()
modelSGD.fit(x_train_tfidf, y_train_tfidf)

y_pred = modelSGD.predict(x_test_tfidf)

print(f1_score(y_test_tfidf, y_pred))
print(classification_report(y_test_tfidf, y_pred))

  return f(*args, **kwargs)


0.9307645996387718
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      2090
           1       0.88      0.99      0.93      1567

    accuracy                           0.94      3657
   macro avg       0.93      0.94      0.94      3657
weighted avg       0.94      0.94      0.94      3657



### LSTM model

In [12]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [13]:
vocab = 5000

In [14]:
onehot_repr = [one_hot(words, vocab) for words in corpus]

In [15]:
maxL = max([len(words) for words in onehot_repr])

In [16]:
padded_x = pad_sequences(onehot_repr, maxlen=maxL)

In [17]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(padded_x, y, test_size=0.33)

In [18]:
modelLSTM = Sequential()

modelLSTM.add(Embedding(vocab, 40, input_length=maxL))
modelLSTM.add(Dropout(0.5))
modelLSTM.add(Bidirectional(LSTM(100)))
modelLSTM.add(Dropout(0.5))
modelLSTM.add(Dense(1, activation="sigmoid"))

In [19]:
modelLSTM.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 49, 40)            200000    
_________________________________________________________________
dropout (Dropout)            (None, 49, 40)            0         
_________________________________________________________________
bidirectional (Bidirectional (None, 200)               112800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 201       
Total params: 313,001
Trainable params: 313,001
Non-trainable params: 0
_________________________________________________________________


In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

modelLSTM.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])
modelLSTM.fit(x_train, y_train, epochs=20, batch_size=100, validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fddbff60400>

In [21]:
y_pred = modelLSTM.predict(x_test)
y_pred = np.round(y_pred).astype(int)
print('F1 score:',f1_score(y_test, y_pred))
print('\n\n\t\t\t classification_report\n\n',classification_report(y_test, y_pred))

F1 score: 0.9054333521338597


			 classification_report

               precision    recall  f1-score   support

           0       0.93      0.92      0.93      3393
           1       0.90      0.91      0.91      2642

    accuracy                           0.92      6035
   macro avg       0.91      0.92      0.92      6035
weighted avg       0.92      0.92      0.92      6035

