# Step by step Training
1. Data preprocessing & cleaning
3. Train–test split
3. Feature extraction (BoW, TF-IDF, Word2Vec)
4. Machine learning model training & comparison

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.models import Word2Vec

# Fetch the NLTK resources used later in the notebook: the stop-word lists
# (read via stopwords.words) and the WordNet data used by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
#load dataset
# Raw string literals keep the Windows backslashes literal. In a normal string,
# '\F', '\d' and '\T' are invalid escape sequences, which Python reports with a
# warning (and which newer versions are slated to reject).
fake_df = pd.read_csv(r'D:\Fake_News_predictor\data\Fake.csv')
true_df = pd.read_csv(r'D:\Fake_News_predictor\data\True.csv')


# Label convention used by the rest of the notebook: 0 = fake, 1 = true.
fake_df['label'] = 0
true_df['label'] = 1


# Stack both sources, then shuffle with a fixed seed so the row order is
# reproducible; reset_index drops the pre-shuffle index.
df = pd.concat([fake_df, true_df], axis=0)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


# Keep only the columns used downstream.
df = df[['title', 'text', 'label']]
df.head()

  fake_df = pd.read_csv('D:\Fake_News_predictor\data\Fake.csv')
  true_df = pd.read_csv('D:\Fake_News_predictor\data\True.csv')


Unnamed: 0,title,text,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [10]:
# (rows, columns) of df.
# NOTE(review): the recorded output has 4 columns although only 3 are selected
# at load time — this cell was presumably executed after the 'content' column
# was added further down; confirm by running the notebook top to bottom.
df.shape

(44898, 4)

In [7]:
#Data preprocessing & cleaning
# English stop words held in a set; clean_text tests each token against it.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

In [8]:
def clean_text(text):
    """Normalize a raw document into a space-separated string of lemmas.

    Steps: lowercase, replace every non-letter character with a space, split
    on whitespace, drop tokens found in ``stop_words`` and lemmatize the rest
    with the module-level ``lemmatizer``.

    Args:
        text: Raw document. Non-string values (e.g. ``None`` or the float NaN
            pandas uses for missing cells) are treated as an empty document.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` for non-string or
        letter-free input.
    """
    # Missing cells arrive as NaN/None when this is used with Series.apply;
    # calling .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Digits, punctuation and any non-ASCII characters all become separators.
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [9]:
# Build one text field per article from headline + body. Missing values are
# replaced with '' first: NaN + str yields NaN, so a single missing field would
# otherwise lose the whole row's text.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')
df['content'] = df['content'].apply(clean_text)

# Model inputs (cleaned text) and targets (the 0/1 label column).
X = df['content']
y = df['label']

In [11]:
#train test split
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
#Feature extraction
#bag of words
# Token-count features limited to 5000 vocabulary terms. The vocabulary is
# learned from the training split only; the test split is transformed with
# that same fitted vectorizer.
bow = CountVectorizer(max_features=5000)
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [13]:
#tfidf
# TF-IDF weights over unigrams and bigrams (ngram_range=(1,2)), limited to
# 5000 features. Fitted on the training split only, then applied to the test split.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [14]:
# word2vec
# Whitespace-tokenize the training documents and train the embeddings on the
# training split only.
sentences = [document.split() for document in X_train]
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
)

In [15]:
def avg_word2vec(text, model, vector_size=100):
    """Embed a document as the mean of its known word vectors.

    Args:
        text: Whitespace-separated tokens.
        model: Object whose ``wv`` attribute supports ``word in model.wv``
            and ``model.wv[word]``.
        vector_size: Length of the zero vector returned when no token of
            ``text`` is present in ``model.wv``.

    Returns:
        Element-wise mean of the vectors of the tokens found in ``model.wv``,
        or ``np.zeros(vector_size)`` if none are found.
    """
    embeddings = []
    for token in text.split():
        # Tokens missing from the vocabulary are skipped.
        if token in model.wv:
            embeddings.append(model.wv[token])

    if not embeddings:
        return np.zeros(vector_size)
    return np.mean(embeddings, axis=0)

In [16]:
# One averaged word vector per document, stacked into an array for each split.
X_train_w2v = np.array(
    [avg_word2vec(document, w2v_model) for document in X_train]
)
X_test_w2v = np.array(
    [avg_word2vec(document, w2v_model) for document in X_test]
)

In [17]:
## machine learning algorithms
from sklearn.metrics import confusion_matrix

# Logistic regression on the TF-IDF features, with a full report.
# (In the recorded run, the comparison cell further down shows BoW scoring
# slightly higher than TF-IDF for this model.)
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print('Logistic Regression Accuracy (TF-IDF):', accuracy_score(y_test, y_pred_lr))
print(confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy (TF-IDF): 0.9897550111358575
[[4631   65]
 [  27 4257]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4696
           1       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [18]:
# Refit one logistic-regression estimator on each feature representation in
# turn (BoW, TF-IDF, Word2Vec) and print its test accuracy. After this cell
# `lr` holds the Word2Vec fit.
lr = LogisticRegression(max_iter=1000)
for report_label, train_features, test_features in (
    ('BoW Accuracy:', X_train_bow, X_test_bow),
    ('TF-IDF Accuracy:', X_train_tfidf, X_test_tfidf),
    ('Word2Vec Accuracy:', X_train_w2v, X_test_w2v),
):
    lr.fit(train_features, y_train)
    print(report_label, accuracy_score(y_test, lr.predict(test_features)))

BoW Accuracy: 0.9955456570155902
TF-IDF Accuracy: 0.9897550111358575
Word2Vec Accuracy: 0.9752783964365256


In [20]:
#naive bayes
# Multinomial Naive Bayes on the two sparse count-style representations.
# confusion_matrix is the name imported in the logistic-regression cell above.
nb = MultinomialNB()

# Bag of Words
y_pred_nb_bow = nb.fit(X_train_bow, y_train).predict(X_test_bow)

print('Naive Bayes Accuracy (BoW):', accuracy_score(y_test, y_pred_nb_bow))
print(confusion_matrix(y_test, y_pred_nb_bow))
print(classification_report(y_test, y_pred_nb_bow))

# TF-IDF (the same estimator object is refit, so `nb` ends up holding this fit)
y_pred_nb_tfidf = nb.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

print('Naive Bayes Accuracy (TF-IDF):', accuracy_score(y_test, y_pred_nb_tfidf))
print(confusion_matrix(y_test, y_pred_nb_tfidf))
print(classification_report(y_test, y_pred_nb_tfidf))

Naive Bayes Accuracy (BoW): 0.9521158129175946
[[4476  220]
 [ 210 4074]]
              precision    recall  f1-score   support

           0       0.96      0.95      0.95      4696
           1       0.95      0.95      0.95      4284

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980

Naive Bayes Accuracy (TF-IDF): 0.9482182628062361
[[4460  236]
 [ 229 4055]]
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      4696
           1       0.95      0.95      0.95      4284

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980



In [None]:
# Accuracy-only recap for Naive Bayes on BoW and TF-IDF; the recorded values
# match the detailed reports of the previous Naive Bayes cell.
nb = MultinomialNB()

for report_label, train_features, test_features in (
    ('BoW Accuracy:', X_train_bow, X_test_bow),
    ('TF-IDF Accuracy:', X_train_tfidf, X_test_tfidf),
):
    nb.fit(train_features, y_train)
    print(report_label, accuracy_score(y_test, nb.predict(test_features)))

BoW Accuracy: 0.9521158129175946
TF-IDF Accuracy: 0.9482182628062361
