In [1]:
pip install pandas numpy scikit-learn matplotlib seaborn nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load the data
df_fake = pd.read_csv(r'C:\Users\india\Documents\fake_news_detection\Fake.csv')
df_real = pd.read_csv(r'C:\Users\india\Documents\fake_news_detection\True.csv')

# Add labels
df_fake['label'] = 0  # Fake
df_real['label'] = 1  # Real

# Combine datasets
df = pd.concat([df_fake, df_real], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)  # Shuffle the data

# Check structure
df.head()


Unnamed: 0,title,text,subject,date,label
0,Trump to back Palestinian 'self-determination'...,WASHINGTON (Reuters) - President Donald Trump ...,politicsNews,"May 12, 2017",1
1,Desperate travelers crowd Puerto Rico airport ...,"SAN JUAN, Puerto Rico (Reuters) - Hundreds of ...",worldnews,"September 25, 2017",1
2,Five Cops Handcuff Dr. Dre At His Home Becaus...,If Dr. Dre were a white guy the police would h...,News,"July 26, 2016",0
3,Two Russian soldiers killed by shelling in Syr...,MOSCOW (Reuters) - Two Russian servicemen have...,worldnews,"September 4, 2017",1
4,UNDERCOVER NYPD COP Busts 2 Women Building Bom...,Thank goodness these Muslim women have a frien...,politics,"Nov 1, 2015",0


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = "".join([c for c in text if c not in string.punctuation])
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(clean_text)
df[['text', 'clean_text']].head(3)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\india\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,clean_text
0,WASHINGTON (Reuters) - President Donald Trump ...,washington reuter presid donald trump express ...
1,"SAN JUAN, Puerto Rico (Reuters) - Hundreds of ...",san juan puerto rico reuter hundr strand touri...
2,If Dr. Dre were a white guy the police would h...,dr dre white guy polic would treat total diffe...


In [6]:
from sklearn.model_selection import train_test_split

# X = cleaned news text
X = df['clean_text']

# y = label (0 for fake, 1 for real)
y = df['label']

# Split data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Training set size:", len(X_train))
print("Test set size:", len(X_test))
print("Sample training data:\n", X_train.head())
print("Sample training labels:\n", y_train.head())


Training set size: 35918
Test set size: 8980
Sample training data:
 36335                                                     
12384    windhoek reuter namibia govern tuesday reject ...
24419    philadelphia reuter democrat convent unfold on...
24740    washington reuter obama administr thursday unv...
27039    presid trump prove run white hous vastli diffe...
Name: clean_text, dtype: object
Sample training labels:
 36335    0
12384    1
24419    1
24740    1
27039    0
Name: label, dtype: int64


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with max 5000 words
tfidf = TfidfVectorizer(max_features=5000)

# Fit on training data and transform both train & test
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print("TF-IDF train shape:", X_train_tfidf.shape)
print("TF-IDF test shape:", X_test_tfidf.shape)


TF-IDF train shape: (35918, 5000)
TF-IDF test shape: (8980, 5000)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create and train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9891982182628062
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4685
           1       0.99      0.99      0.99      4295

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [10]:
import joblib

# Save model
joblib.dump(model, 'fake_news_model.pkl')

# Save TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [11]:
# Load model and vectorizer
loaded_model = joblib.load('fake_news_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example input
news_text = "Government announces new plans for education reform"

# Preprocess and predict
cleaned = clean_text(news_text)  # use your clean_text() function
vector = loaded_vectorizer.transform([cleaned])
prediction = loaded_model.predict(vector)

print("Prediction:", "Real" if prediction[0] == 1 else "Fake")


Prediction: Fake
