# Fake News Prediction using Word2Vec & Naive Bayes
Dataset: `/kaggle/input/fake-news/news.csv`

In [None]:

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from gensim.models import Word2Vec
from tqdm import tqdm
# Fetch the NLTK stopword corpus that `stopwords.words('english')` reads below.
nltk.download('stopwords')

# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()


## Load Dataset (Same as Template)

In [None]:

# Read the news CSV into a DataFrame and preview its first rows.
DATASET_PATH = '/kaggle/input/fake-news/news.csv'
df = pd.read_csv(DATASET_PATH)
df.head()


## Preprocessing

In [None]:

# English stopwords held in a set so the per-token membership test in
# clean_text is a hash lookup.
stop_words = set(stopwords.words('english'))

# Single Porter stemmer instance shared by every clean_text call.
ps = PorterStemmer()

def clean_text(text):
    """Turn one raw document into a list of stemmed, stopword-free tokens.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    split on whitespace, words found in the module-level ``stop_words`` set
    are dropped, and the remaining words are stemmed with the module-level
    ``ps`` stemmer.

    Missing values (``None`` / ``NaN``) yield an empty list. Coercing them
    with ``str`` would otherwise inject the bogus token ``"nan"`` into the
    corpus.

    Args:
        text: Raw document. Non-string, non-missing values are converted
            with ``str`` before cleaning.

    Returns:
        list[str]: Stemmed tokens in their original order.
    """
    if not isinstance(text, str):
        # Only scalars can be "missing"; anything else falls through to str().
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return []
        text = str(text)

    words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    return [ps.stem(w) for w in words if w not in stop_words]

# Tokenise every article body (with a progress bar) and keep the token lists
# in a new 'clean' column, then preview the result.
raw_text = df['text']
df['clean'] = raw_text.progress_apply(clean_text)
df.head()


## Train Word2Vec

In [None]:

# Train word embeddings on the tokenised corpus.
w2v_model = Word2Vec(
    sentences=df['clean'],  # one token list per document
    vector_size=100,        # embedding dimensionality
    window=5,               # context window size
    min_count=2,            # ignore words seen fewer than 2 times
)


## Vectorize

In [None]:

def vectorize(tokens, model=None):
    """Average the word embeddings of ``tokens`` into one document vector.

    Tokens missing from the model's vocabulary are skipped. The output
    length is taken from the model instead of being hard-coded, so it stays
    correct if the embedding size is changed.

    Args:
        tokens: Iterable of string tokens.
        model: Object exposing ``wv`` (supports ``token in wv`` and
            ``wv[token]``) and ``vector_size``. Defaults to the module-level
            ``w2v_model``.

    Returns:
        numpy.ndarray: Vector of length ``model.vector_size`` holding the
        mean embedding, or all zeros when no token is in the vocabulary.
    """
    if model is None:
        model = w2v_model

    keyed_vectors = model.wv
    total = np.zeros(model.vector_size)
    hits = 0
    for token in tokens:
        if token in keyed_vectors:
            total += keyed_vectors[token]
            hits += 1

    # Avoid dividing by zero for documents with no in-vocabulary tokens.
    return total / hits if hits else total

# Embed every document, keep the vectors on the frame, and stack them into
# the feature matrix; the targets come from the 'label' column.
doc_vectors = df['clean'].apply(vectorize)
df['vector'] = doc_vectors

X = np.vstack(doc_vectors.to_numpy())
y = df['label']


## Train-Test Split

In [None]:

# Hold out 20% of the documents for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# MultinomialNB expects non-negative features, while averaged embeddings can
# be negative. Shift by the minimum of the training split only, so no test
# statistics feed into preprocessing.
shift = X_train.min()
X_train_nb = X_train - shift
# Held-out values can fall below the training minimum; clip so the test
# matrix is non-negative as well.
X_test_nb = np.clip(X_test - shift, 0, None)


## Train Naive Bayes

In [None]:

# Fit a multinomial Naive Bayes classifier on the shifted training features
# (`fit` returns the estimator itself).
model = MultinomialNB().fit(X_train_nb, y_train)


## Evaluation

In [None]:

# Predict on the held-out split, compute the metrics, then report them.
pred = model.predict(X_test_nb)

accuracy = accuracy_score(y_test, pred)
report = classification_report(y_test, pred)
matrix = confusion_matrix(y_test, pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)
