In [10]:
### Python Packages
# The next cell imports all the Python libraries needed for data processing, visualization,
# natural language processing (NLP), and machine learning.
# Loading every dependency up front lets the rest of the notebook run without interruption.

In [11]:
# Used Packages
import pandas as pd         # For data manipulation and handling
import numpy as np          # For numerical computations
import matplotlib.pyplot as plt  # For visualizations
import seaborn as sns       # For enhanced visualizations
from sklearn.model_selection import train_test_split  # For splitting datasets
from sklearn.feature_extraction.text import TfidfVectorizer  # For text vectorization
from sklearn.linear_model import LogisticRegression  # For training the ML model
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  # For evaluation
import nltk                 # For natural language processing (NLP)

# Download necessary NLTK data
# Fetches the 'stopwords' package into the local nltk_data directory; when the package
# is already present NLTK just reports that it is up to date.
# Kept as the cell's last bare expression, so the notebook also shows the call's
# return value as the cell output.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pouyasmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# Load datasets
# Folder that contains fake.csv and true.csv. Change this single value if your
# copies of the files live somewhere else.
DATA_DIR = 'data'

# Each CSV becomes its own DataFrame: one for fake articles, one for true articles.
fake_df = pd.read_csv(f'{DATA_DIR}/fake.csv')
true_df = pd.read_csv(f'{DATA_DIR}/true.csv')

# Display the first few rows of the fake-news dataset
# (the true-news dataset is previewed in the following cell).
print("Fake News Dataset:")
print(fake_df.head())



Fake News Dataset:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [19]:
# Preview the true-news frame: heading first, then its leading rows on the next line.
print("\nTrue News Dataset:", true_df.head(), sep="\n")


True News Dataset:
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   



1.2 Add Labels
We will add a label column to each dataset:

label = 0 for fake news.
label = 1 for true news.

In [24]:
# Add labels
# Tag every row with its class so the two sources can still be told apart later:
#   fake_df rows -> 0 (fake news), true_df rows -> 1 (true news)
fake_df['label'], true_df['label'] = 0, 1


In [26]:
# Show the leading rows of the fake-news frame, now including the 'label' column.
fake_df.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [28]:
# Show the leading rows of the true-news frame, now including the 'label' column.
true_df.head(n=5)

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1



1.3 Combine the Datasets
Concatenate the two datasets into a single DataFrame, then shuffle the rows so that fake and true articles are mixed together.

In [31]:
# Build the combined dataset in one pipeline:
#   1. stack the fake and true frames on top of each other with a fresh index,
#   2. shuffle every row (frac=1) using a fixed seed so the order is repeatable,
#   3. renumber the index so it is sequential again after the shuffle.
df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the combined, shuffled data
print("Combined Dataset:", df.head(), sep="\n")


Combined Dataset:
                                               title  \
0  Ben Stein Calls Out 9th Circuit Court: Committ...   
1  Trump drops Steve Bannon from National Securit...   
2  Puerto Rico expects U.S. to lift Jones Act shi...   
3   OOPS: Trump Just Accidentally Confirmed He Le...   
4  Donald Trump heads for Scotland to reopen a go...   

                                                text       subject  \
0  21st Century Wire says Ben Stein, reputable pr...       US_News   
1  WASHINGTON (Reuters) - U.S. President Donald T...  politicsNews   
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...  politicsNews   
3  On Monday, Donald Trump once again embarrassed...          News   
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...  politicsNews   

                  date  label  
0    February 13, 2017      0  
1       April 5, 2017       1  
2  September 27, 2017       1  
3         May 22, 2017      0  
4       June 24, 2016       1  



1.4 Explore the Dataset
Perform basic exploration to understand the structure of the data.

In [35]:
# Size of the combined frame as (rows, columns)
print("Dataset Shape:", df.shape)

# Number of missing entries in each column
print("\nMissing Values:", df.isna().sum(), sep="\n")

# How many rows carry each label value
print("\nClass Distribution:", df['label'].value_counts(), sep="\n")


Dataset Shape: (44898, 5)

Missing Values:
title      0
text       0
subject    0
date       0
label      0
dtype: int64

Class Distribution:
label
0    23481
1    21417
Name: count, dtype: int64


In [47]:
# List the column names of the combined frame (shown as the cell output).
df.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [51]:
# Keep only the necessary columns: 'text' and 'label'
# .copy() makes the result an independent DataFrame rather than a selection taken from
# the wider frame, so assigning or overwriting columns on `df` in later cells does not
# raise pandas' SettingWithCopyWarning.
df = df[['text', 'label']].copy()

# Display the first few rows
print(df.head())


                                                text  label
0  21st Century Wire says Ben Stein, reputable pr...      0
1  WASHINGTON (Reuters) - U.S. President Donald T...      1
2  (Reuters) - Puerto Rico Governor Ricardo Rosse...      1
3  On Monday, Donald Trump once again embarrassed...      0
4  GLASGOW, Scotland (Reuters) - Most U.S. presid...      1
