In [2]:
import pandas as pd
import numpy as np

In [3]:
train_data = pd.read_csv("fake-news/train.csv")
test_data = pd.read_csv("fake-news/test.csv")

In [4]:
train_data.isna().sum() 

id           0
title      558
author    1957
text        39
label        0
dtype: int64

### Processing data

In [5]:
train_data = train_data.dropna()

In [6]:
y = train_data[['label']].reset_index(drop=True)
x = train_data.drop('label', axis=1).reset_index(drop=True)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

In [8]:
ps = PorterStemmer()

corpus = []
for sent in x['title']:
    sent = re.sub(r"[^a-zA-Z]"," ",sent)
    sent = word_tokenize(sent)
    sent = [ps.stem(word.lower()) for word in sent if not word in set(stopwords.words('english'))]
    sent = " ".join(sent)
    corpus.append(sent)

### Coverting data to vector form

In [9]:
cv = CountVectorizer(max_features=5000,ngram_range=(1,3))
corpus_cv = cv.fit_transform(corpus).toarray()
print(cv.get_feature_names()[:20])

['abandon', 'abc', 'abduct', 'abe', 'abedin', 'abl', 'abort', 'about', 'about elect', 'about elect violenc', 'about emf', 'about emf damag', 'about hillari', 'about it', 'about the', 'about to', 'about trump', 'abov', 'abroad', 'absolut']


In [10]:
tfidf = TfidfVectorizer(max_features=5000,ngram_range=(1,3))
corpus_tfidf = tfidf.fit_transform(corpus).toarray()

### Splitting the data

In [29]:
from sklearn.model_selection import train_test_split

x_train_cv, x_test_cv, y_train_cv, y_test_cv = train_test_split(corpus_cv, y, test_size=0.2)
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(corpus_tfidf, y, test_size=0.2)

### Model

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score

modelNB = MultinomialNB()
modelNB.fit(x_train_cv, y_train_cv)

y_pred = modelNB.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))

  return f(*args, **kwargs)


0.8995739500912964
              precision    recall  f1-score   support

           0       0.94      0.90      0.92      2056
           1       0.88      0.92      0.90      1601

    accuracy                           0.91      3657
   macro avg       0.91      0.91      0.91      3657
weighted avg       0.91      0.91      0.91      3657



In [31]:
from sklearn.linear_model import PassiveAggressiveClassifier
linear_clf = PassiveAggressiveClassifier(max_iter=1000)
linear_clf.fit(x_train_cv, y_train_cv)

y_pred = linear_clf.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))

  return f(*args, **kwargs)


0.9075524913820119
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      2056
           1       0.91      0.90      0.91      1601

    accuracy                           0.92      3657
   macro avg       0.92      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657



In [32]:
from sklearn.linear_model import RidgeClassifier
modelR = RidgeClassifier()
modelR.fit(x_train_cv, y_train_cv)

y_pred = modelR.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))


  return f(*args, **kwargs)


0.9282725642558278
              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2056
           1       0.89      0.97      0.93      1601

    accuracy                           0.93      3657
   macro avg       0.93      0.94      0.93      3657
weighted avg       0.94      0.93      0.93      3657



In [33]:
from sklearn.linear_model import SGDClassifier
modelSGD = SGDClassifier()
modelSGD.fit(x_train_cv, y_train_cv)

y_pred = modelSGD.predict(x_test_cv)

print(f1_score(y_test_cv, y_pred))
print(classification_report(y_test_cv, y_pred))

  return f(*args, **kwargs)


0.9275184275184275
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      2056
           1       0.91      0.94      0.93      1601

    accuracy                           0.94      3657
   macro avg       0.93      0.94      0.93      3657
weighted avg       0.94      0.94      0.94      3657



In [35]:
from sklearn.linear_model import SGDClassifier
modelSGD = SGDClassifier()
modelSGD.fit(x_train_tfidf, y_train_tfidf)

y_pred = modelSGD.predict(x_test_tfidf)

print(f1_score(y_test_tfidf, y_pred))
print(classification_report(y_test_tfidf, y_pred))

  return f(*args, **kwargs)


0.9262270400481782
              precision    recall  f1-score   support

           0       0.98      0.90      0.94      2082
           1       0.88      0.98      0.93      1575

    accuracy                           0.93      3657
   macro avg       0.93      0.94      0.93      3657
weighted avg       0.94      0.93      0.93      3657

