Importing the required Libraries

In [7]:
# Data manipulation and cleaning
import pandas as pd
import numpy as np

# Text processing
import string

# Machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Model
from sklearn.naive_bayes import MultinomialNB  # You can try other models too

# For handling warnings
import warnings
# Silences every warning category for the rest of the session. This keeps the
# output tidy but also hides library warnings that may be worth reading;
# narrow or remove this filter when debugging.
warnings.filterwarnings('ignore')

Loading the Dataset

In [2]:
# Read the labelled news articles from the CSV in the working directory,
# then preview the first five rows to confirm the columns loaded as expected.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
train_df.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## Data Cleaning

In [10]:
# Count the missing entries in each column
missing_per_column = train_df.isna().sum()
print("Missing Values :\n" , missing_per_column)

# Count the rows that exactly repeat an earlier row
duplicate_row_count = train_df.duplicated().sum()
print("Number of Duplicate Rows :" , duplicate_row_count)

Missing Values :
 id                 0
title            558
author          1957
text              39
label              0
cleaned_text       0
dtype: int64
Number of Duplicate Rows : 0


In [12]:
# Drop every row that is missing its text, title, or author.
# The result is re-assigned rather than mutated in place, and the subset is
# limited to columns that still exist, so this cell can be re-run safely even
# after the 'author' column has been removed from the frame.
required_columns = [col for col in ('text', 'title', 'author') if col in train_df.columns]
train_df = train_df.dropna(subset=required_columns)

# Confirm that no missing values remain in the required columns
print("Missing Values :\n" , train_df.isnull().sum())

Missing Values :
 id              0
title           0
author          0
text            0
label           0
cleaned_text    0
dtype: int64


In [13]:
# Drop the columns that are not used as model input.
# Re-assigning (instead of inplace=True) together with errors='ignore' makes the
# cell idempotent: running it a second time no longer raises a KeyError for
# columns that were already removed.
train_df = train_df.drop(columns=['id','author'], errors='ignore')

train_df.head()

Unnamed: 0,title,text,label,cleaned_text
0,House Dem Aide: We Didn’t Even See Comey’s Let...,House Dem Aide: We Didn’t Even See Comey’s Let...,1,house dem aide we didn’t even see comey’s lett...
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Ever get the feeling your life circles the rou...,0,ever get the feeling your life circles the rou...
2,Why the Truth Might Get You Fired,"Why the Truth Might Get You Fired October 29, ...",1,why the truth might get you fired october 29 2...
3,15 Civilians Killed In Single US Airstrike Hav...,Videos 15 Civilians Killed In Single US Airstr...,1,videos 15 civilians killed in single us airstr...
4,Iranian woman jailed for fictional unpublished...,Print \nAn Iranian woman has been sentenced to...,1,print \nan iranian woman has been sentenced to...


## Data Preprocessing

In [6]:
# Function for basic text preprocessing
def preprocess_text_simple(text):
    """Return ``text`` lower-cased with ASCII punctuation removed.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        inputs (numbers, ``None``, NaN) are accepted and cleaned as their
        string representation.

    Returns
    -------
    str
        The lower-cased text with every character listed in
        ``string.punctuation`` deleted. Only ASCII punctuation is covered:
        typographic characters such as the curly apostrophe (’) are kept,
        and whitespace (including newlines) is left untouched.
    """
    # A translation table deletes all punctuation in a single pass over the
    # string, instead of testing every character against the punctuation set
    # in a Python-level loop (noticeably slower on full-length articles).
    punctuation_table = str.maketrans('', '', string.punctuation)
    return str(text).lower().translate(punctuation_table)

# Clean every article body and store the result next to the raw text
raw_text = train_df['text']
train_df['cleaned_text'] = raw_text.map(preprocess_text_simple)

# Show raw and cleaned text side by side for the first few rows
print(train_df[['text','cleaned_text']].head())

                                                text  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1  Ever get the feeling your life circles the rou...   
2  Why the Truth Might Get You Fired October 29, ...   
3  Videos 15 Civilians Killed In Single US Airstr...   
4  Print \nAn Iranian woman has been sentenced to...   

                                        cleaned_text  
0  house dem aide we didn’t even see comey’s lett...  
1  ever get the feeling your life circles the rou...  
2  why the truth might get you fired october 29 2...  
3  videos 15 civilians killed in single us airstr...  
4  print \nan iranian woman has been sentenced to...  


## Vectorization (Transform the text data into numerical features using TF-IDF):

In [20]:
# Re-imported in this cell to make explicit which library provides the vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 5000 terms to keep the feature matrix a manageable width
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the cleaned articles and encode
# them as a sparse TF-IDF matrix in one step
cleaned_corpus = train_df['cleaned_text']
X = vectorizer.fit_transform(cleaned_corpus)

# Report the matrix dimensions as (number of samples, number of features)
print(f"Shape of transformed data: {X.shape}")

Shape of transformed data: (18285, 5000)


Defining The Target Variable

In [21]:
# Target vector: the 'label' column of train_df (the two classes are 0 and 1).
# It is taken from the same frame as 'cleaned_text', so rows stay aligned.
y = train_df['label']

## Training The Model

We'll train a simple Logistic Regression model to classify the news as real (0) or fake (1).

In [22]:
#Immpoting neecessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [24]:
# Split the cleaned articles BEFORE vectorizing (80% training, 20% testing).
# Fitting TF-IDF on the full corpus would let the test rows influence the
# vocabulary and IDF weights, which leaks test-set information into training
# and makes the reported scores optimistic. The same random_state and row
# count are used, so the rows assigned to each split are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    train_df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Fit the vectorizer on the training split only, then encode the held-out
# split with that fitted vocabulary. max_features=5000 matches the cap used
# when the corpus was first vectorized. This fitted `vectorizer` is the one
# that must be saved alongside the model.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Initializing the Logistic Regression model
model = LogisticRegression()

# Training the model
model.fit(X_train, y_train)

In [25]:
# Predict a label for every row of the held-out split
y_pred = model.predict(X_test)

# Summarize precision, recall, F1 and support per class on the held-out split
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.96      0.95      2082
           1       0.94      0.93      0.93      1575

    accuracy                           0.94      3657
   macro avg       0.94      0.94      0.94      3657
weighted avg       0.94      0.94      0.94      3657



## Saving the Model and Vectorizer

In [27]:
# joblib is used to serialize the fitted estimator objects to disk
import joblib

# Persist the classifier and the vectorizer it was trained with; both files
# are needed together to score new text later
for artifact, filename in ((model, 'model.pkl'), (vectorizer, 'vectorizer.pkl')):
    joblib.dump(artifact, filename)

print("Model and vectorizer saved successfully!")


Model and vectorizer saved successfully!


## Loading and Testing the Saved Model

In [33]:
import joblib

# Load the saved model and vectorizer.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that were produced by this notebook or another trusted source.
loaded_model = joblib.load('model.pkl')
loaded_vectorizer = joblib.load('vectorizer.pkl')

# Sample text for prediction
new_text = ["The economy is improving rapidly, says expert report"]

# Apply the same cleaning that was applied to the training text (lower-casing
# and punctuation removal) before vectorizing. Without it, tokens containing
# punctuation (e.g. "don't", "U.S.") are split differently at inference time
# than they were during training.
new_text_cleaned = [preprocess_text_simple(text) for text in new_text]
new_text_vectorized = loaded_vectorizer.transform(new_text_cleaned)

# Prediction using the loaded model
prediction = loaded_model.predict(new_text_vectorized)

# Displaying the result
print(f"Prediction: {prediction[0]}")  # 0 for Real, 1 for Fake


Prediction: 1
