## Importing Libraries

In [1]:
import pandas as pd          # to read and handle data (like Excel)
import re                    # to clean text
import nltk                  # natural language toolkit for text processing
from sklearn.model_selection import train_test_split   # split into train/test
from sklearn.feature_extraction.text import TfidfVectorizer # convert text → numbers
from sklearn.linear_model import LogisticRegression   # ML model
from sklearn.svm import LinearSVC                     # another ML model (SVM)
from sklearn.metrics import accuracy_score, f1_score, classification_report

## Loading the dataset

In [11]:
# Load the two source CSVs and tag every row with its class:
# 0 = Fake, 1 = Real. `assign` returns a labelled copy of each frame.
fake = pd.read_csv("Fake.csv").assign(label=0)
real = pd.read_csv("True.csv").assign(label=1)
# Stack both frames into one dataset with a fresh 0..n-1 index.
df = pd.concat([fake, real], ignore_index=True)
# Combine title + content into a single text column.
# Missing titles/bodies are filled with "" first: string concatenation with
# NaN yields NaN, which would later break the string-based preprocessing.
df["text"] = df["title"].fillna("") + " " + df["text"].fillna("")

In [10]:
# Plain-text preview of the first five rows of the combined frame.
# NOTE(review): the saved output under this cell already lists a `clean_text`
# column, which is only created in the preprocessing cell further down, so it
# appears to come from an out-of-order run - re-run top to bottom to refresh.
first_rows = df.head(n=5)
print(first_rows)

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0   Donald Trump Sends Out Embarrassing New Year’...    News   
1   Drunk Bragging Trump Staffer Started Russian ...    News   
2   Sheriff David Clarke Becomes An Internet Joke...    News   
3   Trump Is So Obsessed He Even Has Obama’s Name...    News   
4   Pope Francis Just Called Out Donald Trump Dur...    News   

                date  label                                         clean_text  
0  December 31, 2017      0  donald trump sends embarrassing new year eve m...  
1  December 31, 2017      0  drunk bragging trump staffer started russian c...  
2  December 30, 2017      0  sherif

## Preprocess the Data

In [12]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Make sure the NLTK data used below is installed so the notebook survives a
# fresh environment; without it the stop-word and WordNet lookups raise
# LookupError. `quiet=True` keeps the download log out of the cell output.
# NOTE(review): "omw-1.4" is included because some NLTK releases appear to
# need it for WordNet lemmatization - confirm against the pinned NLTK version.
for _resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(_resource, quiet=True)

# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    """Normalise one document into a space-separated string of lemmas.

    The text is lower-cased, every character other than ``a-z`` and
    whitespace is deleted, stop words are dropped and each remaining token is
    lemmatized. Relies on the module-level ``stop_words`` collection and
    ``lemmatizer`` object.

    Args:
        text: Raw document text. Non-string values (for example ``None`` or
            the float NaN that pandas uses for missing cells) are treated as
            an empty document instead of raising ``AttributeError``.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` when nothing
        is left or the input was not a string.
    """
    if not isinstance(text, str):
        return ""
    lowered = text.lower()
    # Characters are removed outright (no space is inserted), so "year's"
    # becomes "years" and digits/punctuation vanish.
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    kept = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]
    return " ".join(kept)

# Clean every document and keep the result next to the raw text.
df["clean_text"] = df["text"].map(preprocess)

## Split into Training and Testing

In [13]:
# Features are the cleaned article text; targets are the 0/1 labels
# (0 = Fake, 1 = Real).
X, y = df["clean_text"], df["label"]
# Hold out 20% of the rows for testing. The split is seeded so it can be
# reproduced, and stratified on `y` so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Convert Text → Numbers (TF-IDF)

In [6]:
# Vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
)
# Vocabulary and IDF weights are learned from the training split only...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ...and the test split is encoded with that already-fitted vocabulary.
X_test_tfidf = vectorizer.transform(X_test)

## Train the Models

## Logistic Regression

In [7]:
# Fit logistic regression on the TF-IDF training matrix (`fit` returns the
# estimator itself), then predict labels for the held-out split.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# F1 is computed with f1_score's defaults, i.e. for label 1 (= Real).
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_report = classification_report(
    y_test,
    y_pred_lr,
    target_names=["Fake", "Real"],
)

print("Logistic Regression Results:")
print("Accuracy:", lr_accuracy)
print("F1-score:", lr_f1)
print(lr_report)

Logistic Regression Results:
Accuracy: 0.9870824053452116
F1-score: 0.9865116279069768
              precision    recall  f1-score   support

        Fake       0.99      0.98      0.99      4696
        Real       0.98      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



## Support Vector Machine (SVM)

In [8]:
# Linear SVM on the same TF-IDF features. The seed pins the estimator's
# internal pseudo-random number generation so repeated runs give the same
# model; 42 matches the seed used for the train/test split.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

# F1 is computed with f1_score's defaults, i.e. for label 1 (= Real).
print("SVM Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print("F1-score:", f1_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=["Fake", "Real"]))

SVM Results:
Accuracy: 0.9943207126948775
F1-score: 0.9940524781341108
              precision    recall  f1-score   support

        Fake       1.00      0.99      0.99      4696
        Real       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980

