In [1]:
import pandas as pd
import numpy as np
# If you want to visualize
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from google.colab import drive
# Location of the zipped dataset inside the Colab runtime.
file_path = '/content/archive (4).zip'

# Attach Google Drive at the usual Colab mount point.
drive.mount('/content/drive')

# Parse the archive at file_path into the working DataFrame.
df = pd.read_csv(file_path)

Mounted at /content/drive


In [4]:
# First five rows of the dataset.
print(df.head())
# DataFrame.info() writes its report (dtypes, non-null counts) itself and returns None,
# so it is called directly; wrapping it in print() only adds a stray "None" line.
df.info()
# Summary statistics for each column.
print(df.describe())

                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB
None
                                                     Text label
count                                                9900  9900
unique                                               9865     2
top     Highlights: The Trump presidency on April 13 a...  Fake
freq                                                    8  5000


In [5]:
# Count of missing values in every column.
print(df.isna().sum())

Text     0
label    0
dtype: int64


In [6]:
# Drop rows that contain missing values. Rebinding the name (instead of inplace=True)
# keeps the cell a plain expression of "df without missing rows".
df = df.dropna()

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# --- (A) Load the File ---
# The archive is already present in the Colab runtime, so it is read directly by path
# rather than going through 'files.upload()'.

file_path = "/content/archive (4).zip"
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the file name.")
    # Stop the cell here: continuing would either fail later with a NameError or,
    # worse, silently keep working on a stale `df` from an earlier cell.
    raise
else:
    print("Dataset successfully loaded.")
# If the file is not in the runtime, it can be uploaded manually instead:
# from google.colab import files
# uploaded = files.upload()
# import io
# df = pd.read_csv(io.BytesIO(uploaded['fake_and_real_news.csv']))

# --- (B) Check the data ---
# Heading followed by the first rows, emitted through a single print call.
print("\nFirst 5 rows of the dataset:", df.head(), sep="\n")

# info() writes the column / non-null summary on its own.
print("\nColumn information and Missing Values ​​check:")
df.info()

# --- (C) Label Encoding (Target Column) ---
# The classifiers need a numeric target, so 'Fake' becomes 0 and 'Real' becomes 1.
label_codes = {'Fake': 0, 'Real': 1}
encoded_labels = df['label'].map(label_codes)

# Series.map turns every value that is not a key of the dict into NaN. Fail loudly here
# instead of letting NaN targets reach the train/test split and the model.
unmapped_labels = df.loc[encoded_labels.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected label values (expected 'Fake' or 'Real'): {list(unmapped_labels)}"
    )
df['label'] = encoded_labels

print("\nLabel Distribution (0=Fake, 1=Real):")
print(df['label'].value_counts())

Dataset successfully loaded.

First 5 rows of the dataset:
                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real

Column information and Missing Values ​​check:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB

Label Distribution (0=Fake, 1=Real):
label
0    5000
1    4900
Name: count, dtype: int64


In [16]:
# --- Corrected Import ---
import nltk

# Download NLTK packages (Run this once)
print("Downloading necessary NLTK components...")
if not nltk.download('stopwords'):
    # nltk.download() signals failure through its return value rather than an exception.
    # A failed download is only fatal when no previously installed copy is available.
    try:
        nltk.data.find('corpora/stopwords')
    except LookupError as exc:
        raise RuntimeError(
            "The NLTK 'stopwords' corpus could not be downloaded and no local copy was found."
        ) from exc
print("NLTK downloads complete.")

# You can now proceed to the Text Preprocessing code block.

Downloading necessary NLTK components...
NLTK downloads complete.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [18]:
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Shared NLTK helpers used by preprocess_text:
# the English stop-word list as a set, and one Porter stemmer instance.
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

def preprocess_text(text):
    """Clean one document and return it as a space-separated string of stemmed tokens.

    Steps: replace every character outside a-z / A-Z with a space, lowercase the
    result, split on whitespace, drop tokens found in the module-level
    ``stop_words`` set, and stem the remaining tokens with the module-level
    ``porter`` stemmer.

    Args:
        text: The raw document. Anything that is not a ``str`` (``None``, NaN,
            numbers, ...) is treated as an empty document.

    Returns:
        str: The cleaned tokens joined by single spaces; ``""`` when nothing
        survives the cleaning.
    """
    # Only real strings can be cleaned. Missing values and any other non-text
    # cell become an empty document instead of crashing re.sub below.
    if not isinstance(text, str):
        text = ""

    # 1. Remove non-alphabetic characters and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # 2. Convert to lowercase
    text = text.lower()
    # 3. Tokenize (split into words)
    words = text.split()
    # 4. Remove stop words, then stem what is left
    words = [porter.stem(word) for word in words if word not in stop_words]
    # 5. Join words back into a single string
    return ' '.join(words)

# --- (A) Apply Preprocessing ---
print("\nApplying text preprocessing...")

# The raw articles live in the capitalised 'Text' column; the cleaned version of
# each one is stored next to it in 'clean_text'.
raw_text = df['Text']
df['clean_text'] = raw_text.apply(preprocess_text)

print("Preprocessing Complete.")

# --- (B) Separate Features (X) and Target (y) ---
# X: cleaned article text, y: the binary label column.
X, y = df['clean_text'], df['label']

# --- (C) Split Data into Train and Test Sets ---
# 20% of the rows are held out; stratify=y keeps the label ratio in both parts,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# --- (D) TF-IDF Vectorizer ---
# Vocabulary capped at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training text only; the test text is merely
# transformed with that fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print("\nText Vectorization Complete.")
print(f"Training Features shape: {X_train_tfidf.shape}")
print(f"Testing Features shape: {X_test_tfidf.shape}")


Applying text preprocessing...
Preprocessing Complete.

Text Vectorization Complete.
Training Features shape: (7920, 5000)
Testing Features shape: (1980, 5000)


In [19]:
from sklearn.metrics import classification_report

# --- (A) Train the Model ---
print("\nTraining the Logistic Regression Model...")
model = LogisticRegression(random_state=42, solver='liblinear')
model.fit(X_train_tfidf, y_train)
print("Model Training Complete.")

# --- (B) Predict on the Test Set ---
y_pred = model.predict(X_test_tfidf)

# --- (C) Evaluate the Model ---
accuracy = accuracy_score(y_test, y_pred)

print("\n--- Model Evaluation Results ---")
print(f"Accuracy Score: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1; display names follow the 0 = Fake, 1 = Real encoding.
print("\nClassification Report (Detailed Metrics):")
class_names = ['Fake (0)', 'Real (1)']
print(classification_report(y_test, y_pred, target_names=class_names))


Training the Logistic Regression Model...
Model Training Complete.

--- Model Evaluation Results ---
Accuracy Score: 98.94%

Classification Report (Detailed Metrics):
              precision    recall  f1-score   support

    Fake (0)       0.99      0.99      0.99      1000
    Real (1)       0.99      0.99      0.99       980

    accuracy                           0.99      1980
   macro avg       0.99      0.99      0.99      1980
weighted avg       0.99      0.99      0.99      1980



In [20]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# 1. Train the Linear Support Vector Classifier (SVC)
print("\nTraining the Linear SVC Model...")
svc_model = LinearSVC(random_state=42)
svc_model.fit(X_train_tfidf, y_train)

# 2. Predict on the held-out test features
svc_pred = svc_model.predict(X_test_tfidf)

# 3. Evaluate: overall accuracy first, then the per-class report
print("\n--- Linear SVC Model Evaluation Results ---")
print(f"Accuracy Score: {accuracy_score(y_test, svc_pred) * 100:.2f}%")
print("\nClassification Report:")
svc_class_names = ['Fake (0)', 'Real (1)']
print(classification_report(y_test, svc_pred, target_names=svc_class_names))


Training the Linear SVC Model...

--- Linear SVC Model Evaluation Results ---
Accuracy Score: 99.75%

Classification Report:
              precision    recall  f1-score   support

    Fake (0)       1.00      1.00      1.00      1000
    Real (1)       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [21]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C (the inverse of the regularization strength).
param_grid = {'C': [0.1, 1, 10, 100]}

# Accuracy-scored search over param_grid with 5-fold cross-validation,
# run with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=LogisticRegression(solver='liblinear', random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training features.
print("\nStarting Hyperparameter Tuning...")
grid_search.fit(X_train_tfidf, y_train)

# Report the winning setting and its cross-validation score.
print("\n--- Hyperparameter Tuning Results ---")
print("Best Parameters found:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Score the held-out test set with the best estimator found by the search.
best_model = grid_search.best_estimator_
tuned_pred = best_model.predict(X_test_tfidf)

print("\nTuned Model Classification Report:")
print(classification_report(y_test, tuned_pred, target_names=['Fake (0)', 'Real (1)']))


Starting Hyperparameter Tuning...

--- Hyperparameter Tuning Results ---
Best Parameters found: {'C': 100}
Best Cross-Validation Score: 0.9964646464646464

Tuned Model Classification Report:
              precision    recall  f1-score   support

    Fake (0)       1.00      1.00      1.00      1000
    Real (1)       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [22]:
# Classify one unseen article with 'best_model' from the tuning step, reusing the
# 'preprocess_text' function and the already fitted 'tfidf_vectorizer'.

# 1. The article to classify (kept in a list because the vectorizer expects an iterable)
new_article = ["The president announced today a major overhaul of the nation's healthcare system, receiving mixed reviews from congress."]

# 2. Same cleaning and same fitted vectorizer as the training data
new_article_clean = list(map(preprocess_text, new_article))
new_article_vectorized = tfidf_vectorizer.transform(new_article_clean)

# 3. Predict with the tuned model
final_prediction = best_model.predict(new_article_vectorized)

# 4. Report the verdict for the single article (1 = Real, anything else = Fake)
print(
    "\nPrediction: REAL News (1)"
    if final_prediction[0] == 1
    else "\nPrediction: FAKE News (0)"
)


Prediction: REAL News (1)


In [23]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

# Relies on X_train_tfidf, X_test_tfidf, y_train and y_test from the earlier cells.
print("\nTraining the Linear SVC Model...")
svc_model = LinearSVC(random_state=42)
svc_model.fit(X_train_tfidf, y_train)

# Predictions for the held-out test features, followed by the evaluation printout.
svc_pred = svc_model.predict(X_test_tfidf)

print("\n--- Linear SVC Model Evaluation Results ---")
print(f"Accuracy Score: {accuracy_score(y_test, svc_pred) * 100:.2f}%")
print("\nClassification Report:")
print(
    classification_report(
        y_test,
        svc_pred,
        target_names=['Fake (0)', 'Real (1)'],
    )
)


Training the Linear SVC Model...

--- Linear SVC Model Evaluation Results ---
Accuracy Score: 99.75%

Classification Report:
              precision    recall  f1-score   support

    Fake (0)       1.00      1.00      1.00      1000
    Real (1)       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [24]:
from sklearn.model_selection import GridSearchCV
# LinearSVC is the model tuned here; it scored higher than Logistic Regression above.

# Regularization strengths to compare.
param_grid = {'C': [0.5, 1.0, 5.0]}
# Base estimator for the search; dual=False selects the primal formulation.
svc_to_tune = LinearSVC(dual=False, random_state=42)

# Accuracy-scored search with 3-fold cross-validation, run with n_jobs=-1.
grid_search = GridSearchCV(
    estimator=svc_to_tune,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

# Run the search on the training features.
print("\nStarting Hyperparameter Tuning for Linear SVC...")
grid_search.fit(X_train_tfidf, y_train)

# --- Best Model Output ---
print("\nBest Parameters found:", grid_search.best_params_)

# Evaluate the best estimator found by the search on the held-out test set.
best_svc_model = grid_search.best_estimator_
tuned_pred = best_svc_model.predict(X_test_tfidf)

print("\n--- Tuned Linear SVC Model Final Report ---")
print(f"Accuracy Score: {accuracy_score(y_test, tuned_pred) * 100:.2f}%")
print(classification_report(y_test, tuned_pred, target_names=['Fake (0)', 'Real (1)']))


Starting Hyperparameter Tuning for Linear SVC...

Best Parameters found: {'C': 5.0}

--- Tuned Linear SVC Model Final Report ---
Accuracy Score: 99.75%
              precision    recall  f1-score   support

    Fake (0)       1.00      1.00      1.00      1000
    Real (1)       1.00      1.00      1.00       980

    accuracy                           1.00      1980
   macro avg       1.00      1.00      1.00      1980
weighted avg       1.00      1.00      1.00      1980



In [25]:
# Classify unseen headlines with the tuned SVC, reusing 'preprocess_text' and the
# fitted 'tfidf_vectorizer' from the earlier cells.

# Headlines to classify
new_article = [
    "Exclusive: New secret talks reveal a major trade agreement is expected to be finalized next month, sources confirm.",
    "Billionaire secretly funds asteroid defense shield using alien technology, claims leaked documents."
]

# 1. Clean the text, then vectorize it.
# IMPORTANT: only .transform() here - calling .fit_transform() would refit the vectorizer.
new_article_clean = list(map(preprocess_text, new_article))
new_article_vectorized = tfidf_vectorizer.transform(new_article_clean)

# 2. Predict with the tuned model
final_prediction = best_svc_model.predict(new_article_vectorized)

# 3. Print one verdict line per article
print("\n--- New Article Classification ---")
for i, pred in enumerate(final_prediction):
    if pred == 1:
        label = "REAL News (1)"
    else:
        label = "FAKE News (0)"
    print(f"Article {i+1} Prediction: {label}")


--- New Article Classification ---
Article 1 Prediction: REAL News (1)
Article 2 Prediction: FAKE News (0)


In [26]:
# Uses 'best_svc_model', the best estimator from the LinearSVC tuning step.

# Two probe headlines (one expected to be real, one fake)
new_articles = [
    "The World Health Organization confirmed an outbreak of a severe seasonal flu across several continents.",
    "A secret memo reveals that squirrels are planning to seize control of the global nut supply next Tuesday."
]

# ----------------------------------------------------
# 1. Clean and vectorize with the existing tools (.transform() only, no refit)
new_article_clean = list(map(preprocess_text, new_articles))
new_article_vectorized = tfidf_vectorizer.transform(new_article_clean)

# 2. Predict with the tuned model
final_prediction = best_svc_model.predict(new_article_vectorized)

# 3. Print one verdict line per article
print("\n--- Final Model Classification Test ---")
for i, pred in enumerate(final_prediction):
    if pred == 1:
        label = "REAL News (1)"
    else:
        label = "FAKE News (0)"
    print(f"Article {i+1} Prediction: {label}")


--- Final Model Classification Test ---
Article 1 Prediction: FAKE News (0)
Article 2 Prediction: REAL News (1)


In [31]:
import pickle
import os
from google.colab import drive

# 1. Mount Google Drive. When the drive is already mounted the call still runs,
#    but only reports "Drive already mounted".
drive.mount('/content/drive')

# 2. Define the save path
save_path = '/content/drive/MyDrive/Fake_News_Model_Files/'

# 3. Create the folder if it doesn't exist.
#    exist_ok=True means an already existing folder is not treated as an error.
os.makedirs(save_path, exist_ok=True)
print(f"Directory created or verified at: {save_path}")

# 4. Save the artifacts.
#    Relies on 'best_svc_model' and 'tfidf_vectorizer' from the previous cells.
#    os.path.join builds the file paths, so they stay correct even if save_path is
#    later edited to drop its trailing slash.
#    NOTE: pickle files can execute code when loaded - only load these from a trusted location.

# save the model
model_file = os.path.join(save_path, 'best_svc_model.pkl')
with open(model_file, 'wb') as handle:
    pickle.dump(best_svc_model, handle)
print("Model saved successfully: best_svc_model.pkl")

# save the vectorizer (needed at prediction time to turn new text into the same features)
vectorizer_file = os.path.join(save_path, 'tfidf_vectorizer.pkl')
with open(vectorizer_file, 'wb') as handle:
    pickle.dump(tfidf_vectorizer, handle)
print("Vectorizer saved successfully: tfidf_vectorizer.pkl")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory created or verified at: /content/drive/MyDrive/Fake_News_Model_Files/
Model saved successfully: best_svc_model.pkl
Vectorizer saved successfully: tfidf_vectorizer.pkl
