In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("fake-news/train.csv")
test_data = pd.read_csv("fake-news/test.csv")

In [3]:
train_data.isna().sum() 

id           0
title      558
author    1957
text        39
label        0
dtype: int64

### Processing data

In [4]:
train_data = train_data.dropna()

In [5]:
y = train_data[['label']].reset_index(drop=True)
x = train_data.drop('label', axis=1).reset_index(drop=True)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

In [7]:
ps = PorterStemmer()

corpus = []
for sent in x['title']:
    sent = re.sub(r"[^a-zA-Z]"," ",sent)
    sent = word_tokenize(sent)
    sent = [ps.stem(word.lower()) for word in sent if not word in set(stopwords.words('english'))]
    sent = " ".join(sent)
    corpus.append(sent)

### Coverting data to vector form

In [8]:
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
corpus = cv.fit_transform(corpus).toarray()
print(cv.get_feature_names()[:20])

['abandon', 'abc', 'abduct', 'abe', 'abedin', 'abl', 'abort', 'about', 'about elect', 'about elect violenc', 'about emf', 'about emf damag', 'about hillari', 'about it', 'about the', 'about to', 'about trump', 'abov', 'abroad', 'absolut']


### Splitting the data

In [9]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2)

### Model

In [10]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score

modelNB = MultinomialNB()
modelNB.fit(x_train, y_train)

y_pred = modelNB.predict(x_test)

print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  return f(*args, **kwargs)


0.9009840098400985
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      2075
           1       0.88      0.93      0.90      1582

    accuracy                           0.91      3657
   macro avg       0.91      0.91      0.91      3657
weighted avg       0.91      0.91      0.91      3657



In [11]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=1000)
linear_clf.fit(x_train, y_train)

y_pred = linear_clf.predict(x_test)

print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  return f(*args, **kwargs)


0.9170638703527169
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      2075
           1       0.92      0.91      0.92      1582

    accuracy                           0.93      3657
   macro avg       0.93      0.93      0.93      3657
weighted avg       0.93      0.93      0.93      3657



In [12]:
from sklearn.linear_model import RidgeClassifier
modelR = RidgeClassifier()
modelR.fit(x_train, y_train)

y_pred = modelR.predict(x_test)

print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


  return f(*args, **kwargs)


0.9254901960784315
              precision    recall  f1-score   support

           0       0.98      0.90      0.94      2075
           1       0.89      0.97      0.93      1582

    accuracy                           0.93      3657
   macro avg       0.93      0.94      0.93      3657
weighted avg       0.94      0.93      0.93      3657



In [13]:
from sklearn.linear_model import SGDClassifier
modelSGD = SGDClassifier()
modelSGD.fit(x_train, y_train)

y_pred = modelSGD.predict(x_test)

print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

  return f(*args, **kwargs)


0.9308527131782944
              precision    recall  f1-score   support

           0       0.96      0.93      0.95      2075
           1       0.91      0.95      0.93      1582

    accuracy                           0.94      3657
   macro avg       0.94      0.94      0.94      3657
weighted avg       0.94      0.94      0.94      3657

