In [1]:
import pandas as pd
df = pd.read_csv('/kaggle/input/fake-news-detection/data.csv').dropna()
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,Four ways Bob Corker skewered Donald Trump,Image copyright Getty Images\nOn Sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,Linklater's war veteran comedy speaks to moder...,"LONDON (Reuters) - “Last Flag Flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,Trump’s Fight With Corker Jeopardizes His Legi...,The feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,Egypt's Cheiron wins tie-up with Pemex for Mex...,MEXICO CITY (Reuters) - Egypt’s Cheiron Holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,Jason Aldean opens 'SNL' with Vegas tribute,"Country singer Jason Aldean, who was performin...",1


In [2]:
df.shape

(3988, 4)

In [3]:
import ydata_profiling as ydp

report = ydp.ProfileReport(df)
report

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
df["Head_Body"] = df["Headline"] + " " + df["Body"]
df = df.drop(["URLs", "Headline", "Body"], axis = 1)

In [5]:
df.head()

Unnamed: 0,Label,Head_Body
0,1,Four ways Bob Corker skewered Donald Trump Ima...
1,1,Linklater's war veteran comedy speaks to moder...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...


In [6]:
print(df['Head_Body'].apply(type).value_counts())

Head_Body
<class 'str'>    3988
Name: count, dtype: int64


**Data cleaning**

In [7]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download("stopwords")

def clean_review(text):
    text = text.lower()

    text = re.sub(r'[^a-z\s]','', text)

    words = word_tokenize(text)

    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]

    clean_text = ' '.join(words)
    return clean_text

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df['news_cleaned'] = df['Head_Body'].apply(clean_review)

In [9]:
df.head()

Unnamed: 0,Label,Head_Body,news_cleaned
0,1,Four ways Bob Corker skewered Donald Trump Ima...,four ways bob corker skewered donald trump ima...
1,1,Linklater's war veteran comedy speaks to moder...,linklaters war veteran comedy speaks modern am...
2,1,Trump’s Fight With Corker Jeopardizes His Legi...,trumps fight corker jeopardizes legislative ag...
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...,egypts cheiron wins tieup pemex mexican onshor...
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...,jason aldean opens snl vegas tribute country s...


**Data Lemmatizing**

In [10]:
import spacy

nlp = spacy.load('en_core_web_sm')

def lemmatize_text(sentence):
  doc = nlp(sentence)
  lemmatized_words = [token.lemma_ for token in doc]
  return lemmatized_words

In [11]:
df['news_lem'] = df['news_cleaned'].apply(lemmatize_text)

In [12]:
df.head()

Unnamed: 0,Label,Head_Body,news_cleaned,news_lem
0,1,Four ways Bob Corker skewered Donald Trump Ima...,four ways bob corker skewered donald trump ima...,"[four, way, bob, corker, skewer, donald, trump..."
1,1,Linklater's war veteran comedy speaks to moder...,linklaters war veteran comedy speaks modern am...,"[linklater, war, veteran, comedy, speak, moder..."
2,1,Trump’s Fight With Corker Jeopardizes His Legi...,trumps fight corker jeopardizes legislative ag...,"[trump, fight, corker, jeopardize, legislative..."
3,1,Egypt's Cheiron wins tie-up with Pemex for Mex...,egypts cheiron wins tieup pemex mexican onshor...,"[egypt, cheiron, win, tieup, pemex, mexican, o..."
4,1,Jason Aldean opens 'SNL' with Vegas tribute Co...,jason aldean opens snl vegas tribute country s...,"[jason, aldean, open, snl, vegas, tribute, cou..."


**Word2Vec**

In [13]:
data = df['news_lem']

In [14]:
from gensim.models import Word2Vec

model = Word2Vec(data, vector_size=200, window=5, min_count=3, workers=3, sg=0, epochs=120, alpha=0.001)

In [15]:
similar_words = {}
for word in ['war', 'country', 'sense', 'president']:
    similar_words[word] = [similar[0] for similar in model.wv.most_similar(word, topn=10)]
    print(f"10 most similar words to '{word}': {similar_words[word]}")

10 most similar words to 'war': ['peaceful', 'afghanistan', 'syria', 'saudis', 'conflict', 'protectionist', 'iraq', 'crisis', 'ally', 'peace']
10 most similar words to 'country': ['region', 'conflict', 'monarch', 'community', 'americans', 'population', 'nation', 'immigrant', 'divide', 'illiberal']
10 most similar words to 'sense': ['choice', 'simply', 'difficult', 'necessarily', 'mind', 'realize', 'mistake', 'sort', 'real', 'habit']
10 most similar words to 'president': ['donald', 'barack', 'presidency', 'ivanka', 'obama', 'trumps', 'biden', 'tillerson', 'melania', 'mrs']


In [16]:
import numpy as np
def average_word_vectors(words_list, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0

    for word in words_list:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

df['avg_vector'] = df['news_lem'].apply(lambda x: average_word_vectors(x, model, set(model.wv.index_to_key), 200))

**Fake news detection using ML algorithms**

In [17]:
from sklearn.model_selection import train_test_split
X = df['avg_vector'].to_list()
y = df['Label']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

**SVM**

In [18]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_dist = {
    'C': uniform(loc=0, scale=4),
    'gamma': uniform(loc=0, scale=0.1),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': randint(0, 6)
}

def train_and_test_svm(params):
  svm = SVC()
  search = RandomizedSearchCV(svm, params, cv=10, n_iter = 20, scoring="accuracy")

  search.fit(X_train, y_train)

  y_pred = search.predict(X_val)
  y_pred_train = search.predict(X_train)

  print(f"Accuracy score on train dataset: {accuracy_score(y_train, y_pred_train)}")
  print(f"Accuracy score on validation dataset: {accuracy_score(y_val, y_pred)}")
  print(f"precision on validation dataset: {precision_score(y_val, y_pred)}")
  print(f"recall on validation dataset: {recall_score(y_val, y_pred)}")
  print(f"Parameters: { search.best_params_}")

In [19]:
train_and_test_svm(param_dist)

Accuracy score on train dataset: 0.9510869565217391
Accuracy score on validation dataset: 0.924812030075188
precision on validation dataset: 0.927170868347339
recall on validation dataset: 0.9068493150684932
Parameters: {'C': 3.628450522963837, 'degree': 4, 'gamma': 0.01821700753719373, 'kernel': 'linear'}


**Random Forest**

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

params_grid_forest = {
    'max_depth': [3,5,7,10,12],
    'n_estimators': [50,100,150],
    'min_samples_leaf': [1,3,5,7]
}

def train_and_test_forest(params):
  forest = RandomForestClassifier()
  search = GridSearchCV(forest, params, cv=10, scoring="accuracy")

  search.fit(X_train, y_train)

  y_pred = search.predict(X_val)
  y_pred_train = search.predict(X_train)

  print(f"Accuracy score on train dataset: {accuracy_score(y_train, y_pred_train)}")
  print(f"Accuracy score on validation dataset: {accuracy_score(y_val, y_pred)}")
  print(f"precision on validation dataset: {precision_score(y_val, y_pred)}")
  print(f"recall on validation dataset: {recall_score(y_val, y_pred)}")
  print(f"Parameters: { search.best_params_}")

In [21]:
train_and_test_forest(params_grid_forest)

Accuracy score on train dataset: 0.9987458193979933
Accuracy score on validation dataset: 0.9285714285714286
precision on validation dataset: 0.9162162162162162
recall on validation dataset: 0.9287671232876712
Parameters: {'max_depth': 12, 'min_samples_leaf': 1, 'n_estimators': 150}


**XGBoost**

In [22]:
import xgboost as xgb
params_grid_xgb = {
    'max_depth': [5,7,10,12],
    'n_estimators': [10,50,60],
    'gamma': [0.1,0.5,1, 1.5],
    'learning_rate': [0.1,0.03,0.007]
}

def train_and_test_xgb(params):
  xgboost = xgb.XGBClassifier(objective='binary:logistic')
  search = GridSearchCV(xgboost, params, cv=10, scoring="accuracy")

  search.fit(X_train, y_train)

  y_pred = search.predict(X_val)
  y_pred_train = search.predict(X_train)

  print(f"Accuracy score on train dataset: {accuracy_score(y_train, y_pred_train)}")
  print(f"Accuracy score on validation dataset: {accuracy_score(y_val, y_pred)}")
  print(f"precision on validation dataset: {precision_score(y_val, y_pred)}")
  print(f"recall on validation dataset: {recall_score(y_val, y_pred)}")
  print(f"Parameters: { search.best_params_}")

In [23]:
train_and_test_xgb(params_grid_xgb)

Accuracy score on train dataset: 0.9970735785953178
Accuracy score on validation dataset: 0.9298245614035088
precision on validation dataset: 0.9209809264305178
recall on validation dataset: 0.9260273972602739
Parameters: {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 60}


**Final model**

In [24]:
from sklearn.metrics import classification_report

params = {'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 60}
xgboost = xgb.XGBClassifier(objective='binary:logistic', **params)
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.94      0.95       442
           1       0.93      0.95      0.94       356

    accuracy                           0.95       798
   macro avg       0.95      0.95      0.95       798
weighted avg       0.95      0.95      0.95       798

