In [88]:
# Mount Google Drive at /content/drive; the CSV and the pickled artifacts
# read later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [89]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import svm
from sklearn import tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy
import joblib

# Loading the dataset into a pandas dataframe

In [90]:
# # reading the csv file as dataframe
# news_df = pd.read_csv("/content/drive/MyDrive/Fake News Model 2/fake_and_real_news.csv")
# news_df

# Applying One Hot Encoding (OHE) to the label column of the dataframe

In [91]:
# # getting the dummy column
# real_label = pd.get_dummies(news_df['label'], dtype = int, drop_first = True)

# # dropping the label column in dataframe
# news_df.drop(columns = ["label"], inplace = True)

# # concating real column with the news dataframe
# news_df = pd.concat([news_df, real_label], axis = "columns")
# news_df

# Checking for null values

In [92]:
# news_df.isna().sum()

# Checking for duplicate values

In [93]:
# # removing duplicate values from the dataframe
# news_df.drop_duplicates(keep = 'first', inplace = True)

# # resetting indices of dataframe
# news_df.reset_index(drop = True, inplace = True)
# news_df

# Checking the proportion of real and fake data

In [94]:
# print(f"Proportion of fake and real news data:\n\n{news_df['Real'].value_counts(normalize = True)}")

# Applying NLP preprocessing to the dataframe

In [95]:
# loading the pre-trained English spaCy pipeline "en_core_web_sm";
# the resulting `nlp` object is what nlp_apply() calls on each text
nlp = spacy.load("en_core_web_sm")

# defining a function to apply the nlp preprocessing to one text
def nlp_apply(text : str) -> str:
  """Lower-case `text`, run it through the spaCy pipeline `nlp`, and return
  the lemmas of the tokens that are neither stop words nor punctuation,
  joined by single spaces."""
  # lower-casing first, then converting the text into a spaCy document
  doc = nlp(text.lower())

  # collecting the lemma of every token that survives the filter
  kept_lemmas = []
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    kept_lemmas.append(token.lemma_)

  # returning as a string
  return " ".join(kept_lemmas)

In [96]:
# # applying nlp into dataframe
# news_df["Text"] = news_df["Text"].apply(nlp_apply)

# # saving into csv file
# news_df.to_csv("/content/drive/MyDrive/Fake News Model 2/Processed_news.csv", index = False)

# news_df

# Loading the processed data

In [97]:
# reading the already-preprocessed news CSV from Drive
processed_csv_path = "/content/drive/MyDrive/Fake News Model 2/Processed_news.csv"
processed_data = pd.read_csv(processed_csv_path)

# last expression of the cell, so the dataframe gets rendered
processed_data

Unnamed: 0,Text,Real
0,trump surrogate brutally stab pathetic video...,0
1,u.s conservative leader optimistic common grou...,1
2,trump propose u.s tax overhaul stir concern de...,1
3,court force ohio allow million illegally pur...,0
4,democrats trump agree work immigration bill wa...,1
...,...,...
9860,wikileak admit screw immensely twitter poll ...,0
9861,trump consult republican senator fed chief can...,1
9862,trump lawyer judge lack jurisdiction defamatio...,1
9863,watch right wing pastor falsely credit trump...,0


# Checking the proportion of data

- 0 - Fake News
- 1 - Real News

In [98]:
# number of rows per label (0 = fake news, 1 = real news)
label_counts = processed_data["Real"].value_counts()
label_counts

Unnamed: 0_level_0,count
Real,Unnamed: 1_level_1
0,5000
1,4865


# Applying 80-20 split

In [99]:
# getting the feature: the preprocessed article text
feature = processed_data["Text"]

# getting the label: 0 = fake news, 1 = real news
label = processed_data["Real"]

# 80-20 train/test split using train_test_split():
# - test_size = 0.2 holds out 20% of the rows for testing
# - stratify = label keeps the fake/real ratio the same in both partitions
# - random_state = 42 fixes the shuffle so the split is reproducible across runs
x_train, x_test, y_train, y_test = train_test_split(feature, label,
                                                    test_size = 0.2,
                                                    stratify = label,
                                                    random_state = 42)

# Checking the proportion ratio of training and testing data

In [100]:
# normalised label frequencies of the training and testing partitions
train_proportion = y_train.value_counts(normalize = True)
test_proportion = y_test.value_counts(normalize = True)
divider = "\n----------------------------------------------------------------------\n"

# printing both, separated by a divider line
print(f"Proportion of training data:\n\n{train_proportion}")
print(divider)
print(f"Proportion of testing data:\n\n{test_proportion}")

Proportion of training data:

Real
0    0.506842
1    0.493158
Name: proportion, dtype: float64

----------------------------------------------------------------------

Proportion of testing data:

Real
0    0.506842
1    0.493158
Name: proportion, dtype: float64


# Applying TFIDF vectorizer to training data

In [101]:
# # creating instance of TfidfVectorizer class
# vector = TfidfVectorizer(ngram_range=(1,2),
#                          max_features=25000)

# # saving the vector into file
# joblib.dump(vector, "/content/drive/MyDrive/Fake News Model 2/vector.pkl")

# loading back the vector from file
# NOTE(review): judging by the commented-out code above, the pickled vectorizer
# was presumably dumped before being fitted, so only its settings come from the
# file -- confirm. The vocabulary itself is learned by fit_transform() below.
vector = joblib.load("/content/drive/MyDrive/Fake News Model 2/vector.pkl")

# fitting the vectorizer on the training text and transforming it in one step
x_train_tfidf_form = vector.fit_transform(x_train)

# applying vectorization to the testing data; transform() only, so the
# vectorizer is not re-fitted on the test text
x_test_tfidf_form = vector.transform(x_test)

# Training with naive bayes model

In [102]:
# Multinomial Naive Bayes classifier with smoothing parameter alpha = 0.5
naive_bayes_model = naive_bayes.MultinomialNB(alpha = 0.5) # Smoothing to prevent overconfidence

# training the model on the TF-IDF training features and their labels
naive_bayes_model.fit(x_train_tfidf_form, y_train)

# Getting the accuracy of Naive Bayes model

In [103]:
# predicting labels for the held-out test features
naive_bayes_pred = naive_bayes_model.predict(x_test_tfidf_form)

# computing accuracy (as a percentage) and the per-class report
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred) * 100
naive_bayes_report = classification_report(y_test, naive_bayes_pred)

# printing both results
print(f"Accuracy of Naive Bayes model: {naive_bayes_accuracy:0.2f}%")
print(f"\nClassification Report of Naive Bayes model:\n\n{naive_bayes_report}")

Accuracy of Naive Bayes model: 97.82%

Classification Report of Naive Bayes model:

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1000
           1       0.98      0.97      0.98       973

    accuracy                           0.98      1973
   macro avg       0.98      0.98      0.98      1973
weighted avg       0.98      0.98      0.98      1973



# Training with Random Forest classifier

In [104]:
# Random forest of 100 trees (`ensemble` is imported in the notebook's import cell);
# random_state = 42 keeps the training reproducible
random_forest_model = ensemble.RandomForestClassifier(n_estimators = 100,
                                                      random_state = 42)

# training the model on the TF-IDF training features and their labels
random_forest_model.fit(x_train_tfidf_form, y_train)

# Checking the accuracy of random forest

In [105]:
# predicting labels for the held-out test features
random_forest_pred = random_forest_model.predict(x_test_tfidf_form)

# computing accuracy (as a percentage) and the per-class report
random_forest_accuracy = accuracy_score(y_test, random_forest_pred) * 100
random_forest_report = classification_report(y_test, random_forest_pred)

# printing both results
print(f"Accuracy of Random Forest model: {random_forest_accuracy:0.2f}%")
print(f"\nClassification Report of Random Forest model:\n\n{random_forest_report}")

Accuracy of Random Forest model: 99.95%

Classification Report of Random Forest model:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       973

    accuracy                           1.00      1973
   macro avg       1.00      1.00      1.00      1973
weighted avg       1.00      1.00      1.00      1973



# Training with Decision Trees

In [106]:
# Single decision tree (`tree` is imported in the notebook's import cell);
# random_state = 42 keeps the training reproducible
decision_tree_model = tree.DecisionTreeClassifier(random_state = 42)

# training the model on the TF-IDF training features and their labels
decision_tree_model.fit(x_train_tfidf_form, y_train)

# Getting accuracy of decision trees algorithm

In [107]:
# predicting labels for the held-out test features
decision_tree_pred = decision_tree_model.predict(x_test_tfidf_form)

# computing accuracy (as a percentage) and the per-class report
decision_tree_accuracy = accuracy_score(y_test, decision_tree_pred) * 100
decision_tree_report = classification_report(y_test, decision_tree_pred)

# printing both results
print(f"Accuracy of Decision Tree model: {decision_tree_accuracy:0.2f}%")
print(f"\nClassification Report of Decision Tree model:\n\n{decision_tree_report}")

Accuracy of Decision Tree model: 99.49%

Classification Report of Decision Tree model:

              precision    recall  f1-score   support

           0       1.00      0.99      0.99      1000
           1       0.99      1.00      0.99       973

    accuracy                           0.99      1973
   macro avg       0.99      0.99      0.99      1973
weighted avg       0.99      0.99      0.99      1973



# Training with Support Vector Machine (SVM)

In [108]:
# Linear-kernel SVM (`svm` is imported in the notebook's import cell);
# class_weight='balanced' re-weights the classes by their frequency
svm_model = svm.SVC(C=0.3, kernel='linear',
                    class_weight='balanced')  # More regularization

# training the model on the TF-IDF training features and their labels
svm_model.fit(x_train_tfidf_form, y_train)

# Getting the accuracy of SVM

In [109]:
# predicting labels for the held-out test features
svm_pred = svm_model.predict(x_test_tfidf_form)

# computing accuracy (as a percentage) and the per-class report
svm_accuracy = accuracy_score(y_test, svm_pred) * 100
svm_report = classification_report(y_test, svm_pred)

# printing both results
print(f"Accuracy of SVM model: {svm_accuracy:0.2f}%")
print(f"\nClassification Report of SVM model:\n\n{svm_report}")

Accuracy of SVM model: 99.80%

Classification Report of SVM model:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1000
           1       1.00      1.00      1.00       973

    accuracy                           1.00      1973
   macro avg       1.00      1.00      1.00      1973
weighted avg       1.00      1.00      1.00      1973



# Training with Logistic Regression

In [110]:
# Logistic regression with default settings (`linear_model` is imported in the
# notebook's import cell); random_state = 42 is passed for reproducibility
logistic_model = linear_model.LogisticRegression(random_state = 42)

# training the model on the TF-IDF training features and their labels
logistic_model.fit(x_train_tfidf_form, y_train)

# Getting the accuracy of Logistic Regression

In [111]:
# predicting labels for the held-out test features
logistic_pred = logistic_model.predict(x_test_tfidf_form)

# computing accuracy (as a percentage) and the per-class report
logistic_accuracy = accuracy_score(y_test, logistic_pred) * 100
logistic_report = classification_report(y_test, logistic_pred)

# printing both results
print(f"Accuracy of Logistic Regression model: {logistic_accuracy:0.2f}%")
print(f"\nClassification Report of Logistic Regression model:\n\n{logistic_report}")

Accuracy of Logistic Regression model: 99.39%

Classification Report of Logistic Regression model:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.99      0.99      0.99       973

    accuracy                           0.99      1973
   macro avg       0.99      0.99      0.99      1973
weighted avg       0.99      0.99      0.99      1973



# Combining all models into a soft-voting ensemble

In [112]:
# from sklearn.ensemble import VotingClassifier
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.linear_model import LogisticRegression

# # Define all models
# nb = MultinomialNB(alpha=0.8)  # Naïve Bayes (fixed smoothing)
# rf = RandomForestClassifier(n_estimators=100, max_depth=6, class_weight="balanced", random_state=42)  # RF (less Fake bias)
# svm = SVC(C=0.3, kernel="linear", probability=True, class_weight="balanced")  # SVM (generalization)
# dt = DecisionTreeClassifier(max_depth=6, min_samples_split=15, random_state=42)  # Decision Tree (less Fake bias)
# lr = LogisticRegression(max_iter=500, class_weight="balanced", random_state=42)  # Logistic Regression (fix balance)

# # Create VotingClassifier with Soft Voting
# all_ensemble_model = VotingClassifier(
#     estimators=[("nb", nb), ("rf", rf), ("svm", svm), ("dt", dt), ("lr", lr)],
#     voting="soft",  # Use probabilities for better performance
#     weights=[5, 1, 2, 1, 1]  # Give more weight to Naïve Bayes
# )

# # Train the model
# all_ensemble_model.fit(x_train_tfidf_form, y_train)

# Loading the combined model from file

In [113]:
# saving the combined model into file
# joblib.dump(all_ensemble_model, "/content/drive/MyDrive/Fake News Model 2/all_ensemble_model")

# loading the combined model (joblib is already imported in the notebook's import cell)
# NOTE(review): the later cells feed this model features produced by the `vector`
# fitted in this session; presumably the pickled model was trained on the same
# feature space -- confirm whenever the data or the split changes.
all_ensemble_model = joblib.load("/content/drive/MyDrive/Fake News Model 2/all_ensemble_model")

# Accuracy of combined model

In [114]:
# predicting labels for the held-out test features
all_ensemble_pred = all_ensemble_model.predict(x_test_tfidf_form)

# computing accuracy (as a percentage) and the per-class report
all_ensemble_accuracy = accuracy_score(y_test, all_ensemble_pred) * 100
all_ensemble_report = classification_report(y_test, all_ensemble_pred)

# printing both results
print(f"Accuracy of All Ensemble model: {all_ensemble_accuracy:0.2f}%")
print(f"\nClassification Report of All Ensemble model:\n\n{all_ensemble_report}")

Accuracy of All Ensemble model: 99.24%

Classification Report of All Ensemble model:

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1000
           1       0.99      0.99      0.99       973

    accuracy                           0.99      1973
   macro avg       0.99      0.99      0.99      1973
weighted avg       0.99      0.99      0.99      1973



# Testing the performance of models by predicting real data

In [115]:
# (display name, estimator) pairs, in the order the predictions are printed
named_models = [
    ("Naive Bayes", naive_bayes_model),
    ("Random Forest", random_forest_model),
    ("SVM", svm_model),
    ("Decision Tree", decision_tree_model),
    ("Logistic Regression", logistic_model),
    ("All Ensemble", all_ensemble_model),
]

# dictionary of models with names: the estimator is the key, its name the value
models = {model : name for name, model in named_models}

# defining the function to perform required processing to real data only
def predict_news(text : str) -> None:
    """Preprocess one news text and print every model's Real/Fake prediction.

    The text is cleaned with nlp_apply(), vectorized with the global `vector`,
    and passed to each estimator in the global `models` dictionary
    (estimator -> display name). A prediction equal to 1 is printed as
    'Real', anything else as 'Fake'.

    Parameters:
        text: the raw news text to classify.

    Returns:
        None. Results are printed. Any exception raised along the way is
        caught and reported as a printed message instead of propagating.
    """
    # testing if any error
    try:
        # calling the nlp function and wrapping the result in a one-element
        # array, since the vectorizer is given a collection of documents
        text = np.array([nlp_apply(text)])

        # applying tfidf vectorization
        text_tfidf_form = vector.transform(text)

        # getting each model together with its display name
        for model, model_name in models.items():
            # giving model for prediction
            output = model.predict(text_tfidf_form)

            # printing the results
            print(f"{model_name} Model Prediction: {'Real' if (output[0]) == 1 else 'Fake'}")

    # catching error here; the message names this function so it can be traced
    except Exception as e:
        print(f"Exception occured at predict_news() function: {e}")

# Testing now using real data

In [116]:
# Original news dictionary for testing models
# Each key is a sample news text (headline plus body, or a single sentence) and
# each value is the label it is expected to get: "Real" or "Fake".
original_news = {
    # REAL NEWS
    """\"Scientists Develop AI That Can Detect Cancer in Early Stages\"
    Researchers have successfully developed an AI model capable of detecting cancer in its early stages with **98% accuracy**. The model is expected to improve survival rates and reduce misdiagnosis cases.""" : "Real",

    """\"Microsoft Announces Quantum Computing Breakthrough\"
    Microsoft researchers have announced a major breakthrough in **quantum computing**, claiming they have developed a stable **quantum bit (qubit)** that will revolutionize data encryption and complex problem-solving.""" : "Real",

    """\"WHO Reports Major Decline in Global COVID-19 Cases\"
    The World Health Organization (WHO) has reported a **65% drop** in global COVID-19 cases over the past three months, attributing it to higher vaccination rates and improved treatment protocols.""" : "Real",

    """\"Japan Plans to Build First Space Elevator by 2050\"
    A leading Japanese aerospace company has unveiled plans to construct the world’s **first space elevator** by **2050**, reducing the cost of space travel and enabling a new era of **low-orbit tourism**.""" : "Real",

    """\"Google Unveils AI-Powered Search Engine Update\"
    Google has announced an **AI-powered upgrade** to its search engine, enabling **conversational responses** and better understanding of user queries. The update aims to provide **more accurate and human-like results**.""" : "Real",

    """\"Electric Cars Outnumber Gas Cars for First Time in Norway\"
    Norway has officially become the first country where **electric vehicles (EVs) outnumber gasoline-powered cars**. Experts predict that Norway will become **100% gas-free by 2028**.""" : "Real",

    """\"Breakthrough in Alzheimer's Research Brings New Hope\"
    Scientists have discovered a **new protein therapy** that can significantly **slow down Alzheimer’s disease**. Clinical trials show that patients experienced **improved memory retention** over six months.""" : "Real",

    # NOTE(review): this entry is labelled "Real" although it describes a crewed
    # Mars landing -- confirm that this is the intended expected label.
    """\"China Successfully Conducts First Manned Mission to Mars\"
    China’s space agency has successfully landed **three astronauts on Mars**, marking the first **human mission to the Red Planet**. The mission aims to establish a **permanent research base** on Mars by 2035.""" : "Real",

    # FAKE NEWS
    """\"NASA Confirms That the Earth Is Actually Flat\"
    A leaked NASA document has revealed that the space agency has been **hiding the truth for decades**, and **the Earth is actually flat**. Scientists worldwide are shocked by this revelation.""" : "Fake",

    """\"Elon Musk Reveals Plan to Move Tesla Headquarters to the Moon\"
    In a shocking announcement, **Elon Musk** has revealed plans to **relocate Tesla’s headquarters to a lunar base** by **2030**. The company aims to build the **first-ever car manufacturing plant on the Moon**.""" : "Fake",

    """\"Scientists Successfully Reverse Aging in Humans\"
    A team of genetic engineers has discovered a **miracle treatment** that can **reverse aging**, effectively making **humans immortal**. Clinical trials show that test subjects **became biologically 20 years younger**.""" : "Fake",

    """\"Facebook to Introduce ‘Mind-Reading’ Feature for Status Updates\"
    Facebook has announced a new AI-powered **mind-reading technology** that will allow users to **post status updates directly from their thoughts**. The feature will be launched next year.""" : "Fake",

    """\"Secret Underground Alien Civilization Discovered in Antarctica\"
    A group of explorers has reportedly uncovered an **underground alien city** in Antarctica, believed to be **thousands of years old**. According to the leaked reports, the site contains **advanced extraterrestrial technology**.""" : "Fake",

    """\"Apple iPhone 16 Will Feature a Built-in Hologram Projector\"
    Apple has confirmed that its next iPhone model will include a **hologram projector**, allowing users to **watch 3D videos in mid-air** without any additional accessories.""" : "Fake",

    """\"Government to Ban the Internet for 24 Hours Every Week\"
    A new bill proposed in Congress suggests that **the internet should be shut down for 24 hours every Sunday** to encourage people to spend more time with family. Tech companies are **outraged** by the proposal.""" : "Fake",

    """\"Scientists Discover Atlantis at the Bottom of the Pacific Ocean\"
    Marine researchers claim to have **found the lost city of Atlantis** deep beneath the Pacific Ocean. The discovery could **rewrite history** and prove that the **legend of Atlantis was real all along**.""" : "Fake",

    # short single-sentence samples
    "Mamata Banerjee died yesterday night" : "Fake",

    "Mamata Banerjee is the Chief Minister of West Bengal" : "Real",

    "Narendra Modi become the President of US" : "Fake"
}


# predicting all news from original data, numbering the samples from 1
for news_number, (news, expected_label) in enumerate(original_news.items(), start = 1):
  print(f"News No. {news_number}\n")
  print(f"Original label: {expected_label}\n")
  print(f"News: {news}\n")
  # prints one prediction line per model for this sample
  predict_news(news)
  print("\n-----------------------------------------------------------------------------------\n")

News No. 1

Original label: Real

News: "Scientists Develop AI That Can Detect Cancer in Early Stages"
    Researchers have successfully developed an AI model capable of detecting cancer in its early stages with **98% accuracy**. The model is expected to improve survival rates and reduce misdiagnosis cases.

Naive Bayes Model Prediction: Real
Random Forest Model Prediction: Real
SVM Model Prediction: Real
Decision Tree Model Prediction: Real
Logistic Regression Model Prediction: Real
All Ensemble Model Prediction: Real

-----------------------------------------------------------------------------------

News No. 2

Original label: Real

News: "Microsoft Announces Quantum Computing Breakthrough"
    Microsoft researchers have announced a major breakthrough in **quantum computing**, claiming they have developed a stable **quantum bit (qubit)** that will revolutionize data encryption and complex problem-solving.

Naive Bayes Model Prediction: Real
Random Forest Model Prediction: Real
SVM 