In [1]:
import pandas as pd
import numpy as np

# Load datasets
fake = pd.read_csv("Fake.csv")
true = pd.read_csv("True.csv")

# Add labels
fake["label"] = 0   # Fake = 0
true["label"] = 1   # Real = 1

# Combine datasets
df = pd.concat([fake, true], axis=0)

# Shuffle dataset
df = df.sample(frac=1, random_state=42)

# Reset index
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",1
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",1
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",0
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",1


In [2]:
df.shape

(44898, 5)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [4]:
df["label"].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [5]:
!pip install nltk




[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [8]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenization
    words = text.split()
    
    # Remove stopwords & apply stemming
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return " ".join(words)

In [9]:
df["cleaned_text"] = df["text"].apply(clean_text)

In [10]:
df[["text", "cleaned_text"]].head()

Unnamed: 0,text,cleaned_text
0,"21st Century Wire says Ben Stein, reputable pr...",st centuri wire say ben stein reput professor ...
1,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuter us presid donald trump remov...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuter puerto rico governor ricardo rossello s...
3,"On Monday, Donald Trump once again embarrassed...",monday donald trump embarrass countri accident...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuter us presidenti candid g...


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(
    max_features=5000,      # Use top 5000 important words
    ngram_range=(1,2)       # Use unigrams + bigrams
)

In [13]:
X = vectorizer.fit_transform(df["cleaned_text"])
y = df["label"]

In [14]:
X.shape

(44898, 5000)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [17]:
y_pred = model.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9875278396436525


In [19]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4710
           1       0.98      0.99      0.99      4270

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [21]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[4632,   78],
       [  34, 4236]])

In [22]:
feature_names = vectorizer.get_feature_names_out()

In [23]:
import numpy as np

coefficients = model.coef_[0]

In [24]:
top_real_indices = np.argsort(coefficients)[-20:]
top_real_words = [feature_names[i] for i in top_real_indices]

print("Top words indicating REAL news:")
print(top_real_words)

Top words indicating REAL news:
['that', 'said statement', 'minist', 'us', 'im', 'nov', 'edt', 'monday', 'dont', 'washington', 'friday', 'thursday', 'us presid', 'tuesday', 'wednesday', 'presid donald', 'reuter us', 'washington reuter', 'said', 'reuter']


In [25]:
top_fake_indices = np.argsort(coefficients)[:20]
top_fake_words = [feature_names[i] for i in top_fake_indices]

print("Top words indicating FAKE news:")
print(top_fake_words)

Top words indicating FAKE news:
['via', 'imag', 'presid trump', 'mr', 'gop', 'hillari', 'american', 'imag via', 'even', 'obama', 'wire', 'america', 'rep', 'sen', 'know', 'like', 'presid obama', 'isi', 'claim', 'daili']


In [26]:
import pickle

# Save model
pickle.dump(model, open("fake_news_model.pkl", "wb"))

# Save vectorizer
pickle.dump(vectorizer, open("tfidf_vectorizer.pkl", "wb"))