Import libraries

In [14]:
# Data handling
import pandas as pd
import numpy as np

# Text preprocessing
import re

# ML utilities
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Save models
import pickle


Importing data
Dataset source: https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets

In [26]:

def load_news_csv(path):
    """Read one news CSV into a DataFrame and report how many rows were parsed.

    Malformed lines are still skipped, but ``on_bad_lines="warn"`` makes pandas
    emit a warning for them instead of dropping them silently, so any data
    loss during parsing shows up in the cell output.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed rows.
    """
    frame = pd.read_csv(
        path,
        encoding="utf-8",
        engine="python",
        on_bad_lines="warn",
    )
    # Surface the parsed row count so truncated or partially read files are noticed.
    print(f"Loaded {len(frame)} rows from {path}")
    return frame


# Load Fake news data
fake_df = load_news_csv("/content/Fake.csv")

# Load Real news data
# NOTE(review): this file is read from "/" while Fake.csv is read from
# "/content/" -- confirm that the differing locations are intentional.
true_df = load_news_csv("/True.csv")

print(fake_df.head())
print(true_df.head())


                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept t

In [17]:
# Tag every row with its class before merging: 1 = fake, 0 = real.
for frame, class_id in ((fake_df, 1), (true_df, 0)):
    frame["label"] = class_id

# Stack both sources into a single frame with a fresh 0..n-1 index.
combined = pd.concat([fake_df, true_df], ignore_index=True)

# Shuffle all rows with a fixed seed so the two classes are interleaved
# rather than appearing as two contiguous runs, then renumber the index.
df = combined.sample(frac=1, random_state=42).reset_index(drop=True)

df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36838 entries, 0 to 36837
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    36838 non-null  object
 1   text     36838 non-null  object
 2   subject  36838 non-null  object
 3   date     36838 non-null  object
 4   label    36838 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.4+ MB


Clean the text data
Lowercases the text and removes: URLs, punctuation, digits (anything that is not a–z or whitespace), and extra whitespace
Keeps: the remaining words, in their original order

In [18]:
def clean_text(text):
    """Normalise a raw article body into lowercase, letters-only text.

    Steps, in order: lowercase, drop ``http...`` URLs, delete every character
    that is not ``a-z`` or whitespace, then collapse whitespace runs to a
    single space and trim the ends.

    Stripped characters are deleted rather than replaced by a space, so
    "couldn't" becomes "couldnt" and "well-known" becomes "wellknown".

    Parameters
    ----------
    text : str or any
        Raw text. Non-string values (e.g. ``None`` or the float ``NaN`` that
        pandas uses for missing cells) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``""`` for non-string input.
    """
    # Missing cells arrive as NaN/None, which have no .lower(); map them to
    # empty text instead of raising AttributeError inside .apply().
    if not isinstance(text, str):
        return ""

    text = text.lower()                             # Lowercase
    text = re.sub(r"http\S+", "", text)             # Remove URLs
    text = re.sub(r"[^a-z\s]", "", text)            # Remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip()        # Remove extra spaces
    return text

# Run the cleaner over every article body and keep the result in a new column.
df["clean_text"] = df["text"].map(clean_text)

# Preview raw vs. cleaned text side by side.
df.loc[:, ["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,Virginia Congressman Dave Brat is feeling the ...,virginia congressman dave brat is feeling the ...
1,"Donald Trump and his incompetent, corrupt team...",donald trump and his incompetent corrupt team ...
2,GAZA (Reuters) - The new chief of Palestinian ...,gaza reuters the new chief of palestinian isla...
3,"RANCHO MIRAGE, Calif. (Reuters) - President Ba...",rancho mirage calif reuters president barack o...
4,The greatest love affair in the history of th...,the greatest love affair in the history of the...


Train-test split

*   Splits data into training and testing sets


*  Preserves class balance using stratification



In [19]:
# Features are the cleaned article bodies; the target is the 0/1 label.
X, y = df["clean_text"], df["label"]

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


Converting text to numbers using TF-IDF


*   Converts text into numerical vectors
*   Captures important words and phrases


*   Suppresses common, useless words




In [21]:
# Unigram + bigram TF-IDF, English stop words removed, vocabulary capped at 50,000 terms.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    max_features=50000,
)

# The vocabulary and IDF weights are learned from the training split only ...
X_train_tfidf = tfidf.fit_transform(X_train)

# ... and then reused unchanged to vectorise the test split.
X_test_tfidf = tfidf.transform(X_test)


Train Linear SVM model

Learns a decision boundary between fake and real news

Handles high-dimensional sparse text extremely well

In [22]:
# Hyperparameters for the linear SVM: C is left at 1.0, class weights are
# set to "balanced", and the seed is fixed.
svm_params = {
    "C": 1.0,
    "class_weight": "balanced",
    "random_state": 42,
}
svm_model = LinearSVC(**svm_params)

# Fit on the TF-IDF training matrix; the fitted estimator is the cell output.
svm_model.fit(X_train_tfidf, y_train)


A Linear SVM is:

Very stable

A strong performer on text data

Often better than Logistic Regression on text classification

# Evaluate the model

In [23]:
# Score the held-out split with the trained classifier.
y_pred = svm_model.predict(X_test_tfidf)

# Overall accuracy on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Detailed per-class metrics first, then the confusion matrix.
report_sections = (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
)
for heading, metric_fn in report_sections:
    print(heading)
    print(metric_fn(y_test, y_pred))


Accuracy: 0.993485342019544

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4284
           1       0.99      0.99      0.99      3084

    accuracy                           0.99      7368
   macro avg       0.99      0.99      0.99      7368
weighted avg       0.99      0.99      0.99      7368


Confusion Matrix:
[[4267   17]
 [  31 3053]]


Saving the model and the vectorizer

In [24]:
# Each fitted object is pickled to its own file: first the SVM model,
# then the TF-IDF vectorizer.
artifacts = (
    ("/content/svm_fake_news_model.pkl", svm_model),
    ("/content/tfidf_vectorizer.pkl", tfidf),
)
for artifact_path, artifact in artifacts:
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

print("Model and vectorizer saved successfully.")


Model and vectorizer saved successfully.
