In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
#import numpy as np
#import seaborn as sns
#import matplotlib.pyplot as plt
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.model_selection import train_test_split
#from sklearn.linear_model import PassiveAggressiveClassifier
#from sklearn.metrics import accuracy_score
#from sklearn.metrics import classification_report 
#import string 
#import openai


In [2]:
# Set max row and column display
pd.options.display.max_rows=None
pd.options.display.max_columns=None

In [3]:
# Set paths to Real and Fake article CSV files
file_fake = "Resources/fake_articles.csv"
file_real = "Resources/real_articles.csv"

In [4]:
# Read the Real and Fake article CSV files CSV files into DataFrames
df_fake_articles = pd.read_csv(file_fake)
df_real_articles = pd.read_csv(file_real)

In [5]:
# Print first 5 rows of df_fake_articles dataframe
df_fake_articles.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [6]:
# Display shape of df_fake_articles dataframe
print(f'Shape of df_fake_articles: {df_fake_articles.shape}')

Shape of df_fake_articles: (23481, 4)


In [7]:
# Print first 5 rows of df_real_articles dataframe
df_real_articles.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,31-Dec-17
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,29-Dec-17
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,31-Dec-17
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,30-Dec-17
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,29-Dec-17


In [8]:
# Display shape of df_real_articles dataframe
print(f'Shape of df_real_articles: {df_real_articles.shape}')

Shape of df_real_articles: (21417, 4)


In [9]:
# Display the dataframe information
df_fake_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [10]:
# Display the df_fake_articles subject column values
df_fake_articles['subject'].unique()

array(['News', 'politics', 'Government News', 'left-news', 'US_News',
       'Middle-east'], dtype=object)

In [11]:
# Convert df_fake_articles date column from object to datetime
df_fake_articles['date'] = pd.to_datetime(df_fake_articles['date'],errors='coerce')

In [12]:
# Display first 5 rows of the df_fake_articles dataframe information after datetime conversion
df_fake_articles.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,2017-12-31
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,2017-12-31
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,2017-12-30
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,2017-12-29
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,2017-12-25


In [13]:
# Display the df_fake_articles dataframe information after datetime conversion to confirm there are no nulls
df_fake_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   title    23481 non-null  object        
 1   text     23481 non-null  object        
 2   subject  23481 non-null  object        
 3   date     11868 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 733.9+ KB


In [14]:
# The df_fake_articles date column contained a lot of bad dates that were converted to nulls,
# only 11868 of 23481 rows contain dates. We will review some of the rows with null dates to
# see if the articles seem valid.
df_fake_null_date_rows = df_fake_articles.loc[df_fake_articles['date'].isnull()]
print(df_fake_null_date_rows.head())

                                                  title  \
9050  Democrat Senator Warns Mueller Not To Release ...   
9051  MSNBC ANCHOR Flabbergasted at What Texas Teach...   
9052  WATCH: SNOWFLAKES ASKED Communist Party Platfo...   
9053  JUST IN: BADASS GENERAL JOHN KELLY Shoved Chin...   
9054  DOJ’s JEFF SESSIONS Opens Investigation Into W...   

                                                   text   subject date  
9050  According to The Hill, Democrat Senator Bob Ca...  politics  NaT  
9051  If we protect every other government building ...  politics  NaT  
9052  Ami Horowitz is fantastic! Check out this man ...  politics  NaT  
9053  Just one more reminder of why President Trump ...  politics  NaT  
9054  Thank goodnesss Jeff Sessions is moving on fin...  politics  NaT  


In [15]:
# The df_fake_articles columns for the rows with a null date look good.  Our analysis is not be based on
#  the date, so we will drop the date column from our dataframe and continue with our analysis.
df_fake_articles_revised = df_fake_articles.drop('date', axis=1)
print(df_fake_articles_revised.head())
print(df_fake_articles_revised.info())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  
0  Donald Trump just couldn t wish all Americans ...    News  
1  House Intelligence Committee Chairman Devin Nu...    News  
2  On Friday, it was revealed that former Milwauk...    News  
3  On Christmas day, Donald Trump announced that ...    News  
4  Pope Francis used his annual Christmas Day mes...    News  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null 

In [16]:
# Display the dataframe information
df_real_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [17]:
# Display the df_real_articles subject column values
df_real_articles['subject'].unique()

array(['politicsNews', 'worldnews'], dtype=object)

In [18]:
# Convert df_real_articles date column from object to datetime
df_real_articles['date'] = pd.to_datetime(df_real_articles['date'],errors='coerce')

  df_real_articles['date'] = pd.to_datetime(df_real_articles['date'],errors='coerce')


In [19]:
# Print first 5 rows of updated df_real_articles dataframe after date conversion
df_real_articles.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,2017-12-31
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,2017-12-29
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,2017-12-31
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,2017-12-30
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,2017-12-29


In [20]:
# Display the df_real_articles dataframe information after datetime conversion to confirm there are no nulls
df_real_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   title    21417 non-null  object        
 1   text     21417 non-null  object        
 2   subject  21417 non-null  object        
 3   date     21417 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 669.4+ KB


In [21]:
# All df_real_articles dates converted successfully.  Our analysis is not be based on the date,
# so we will drop the date column from our dataframe to be consistent with the df_real_articles
# dataframe and continue with our analysis.
df_real_articles_revised = df_real_articles.drop('date', axis=1)
print(df_real_articles_revised.head())
print(df_real_articles_revised.info())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews  
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews  
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews  
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews  
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  

In [22]:
# Add a new column "class" to each DataFrame and set values for Real and Fake articles
# class:  Fake = 0;  Real = 1
df_fake_articles_revised['class'] = 0
df_real_articles_revised['class'] = 1


In [23]:
# Print the first 5 rows and info of df_fake_articles_revised dataframe to verify the changes
print("df_fake_articles_revised dataframe:")
print(df_fake_articles_revised.head())
print('\n')
print("df_fake_articles_revised info:")
print(df_fake_articles_revised.info())

df_fake_articles_revised dataframe:
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  class  
0  Donald Trump just couldn t wish all Americans ...    News      0  
1  House Intelligence Committee Chairman Devin Nu...    News      0  
2  On Friday, it was revealed that former Milwauk...    News      0  
3  On Christmas day, Donald Trump announced that ...    News      0  
4  Pope Francis used his annual Christmas Day mes...    News      0  


df_fake_articles_revised info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   -------------- 

In [24]:
# Print the first 5 rows and shape of df_real_articles_revised dataframe to verify the changes
print("df_real_articles_revised dataframe:")
print(df_real_articles_revised.head())
print('\n')
print("df_real_articles_revised info:")
print(df_real_articles_revised.info())

df_real_articles_revised dataframe:
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  class  
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews      1  
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews      1  
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews      1  
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews      1  
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews      1  


df_real_articles_revised info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  

In [25]:
# Concatenate df_fake_articles and df_real_articles dataframes into new df_merged_articles dataframe
df_merged_articles = pd.concat([df_fake_articles_revised, df_real_articles_revised], ignore_index=True)

# Display first and last 5 rows of df_merged_articles dataframe
print(df_merged_articles.head(5))
print(df_merged_articles.tail(5))

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  class  
0  Donald Trump just couldn t wish all Americans ...    News      0  
1  House Intelligence Committee Chairman Devin Nu...    News      0  
2  On Friday, it was revealed that former Milwauk...    News      0  
3  On Christmas day, Donald Trump announced that ...    News      0  
4  Pope Francis used his annual Christmas Day mes...    News      0  
                                                   title  \
44893  'Fully committed' NATO backs new U.S. approach...   
44894  LexisNexis withdrew two products from Chinese ...   
44895  Minsk cultural hub becomes haven from authorities   
448

In [26]:
# Verify "class" column after concatenation
df_merged_articles['class'].value_counts()

class
0    23481
1    21417
Name: count, dtype: int64

In [27]:
# Display df_merged_articles shape
df_merged_articles.shape

(44898, 4)

In [28]:
# Display info for df_merged_articles dataframe
df_merged_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   class    44898 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 1.4+ MB


In [29]:
# Define function to clean articles by convering all text to lower case, removing unnecessary punctuation,
# removing numbers, stopwords, tokenizing, and Lemmatizing the data. 
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Initialize the WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()
    
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]

    # Join the words back into a single string
    cleaned_text = ' '.join(words)
    
    return cleaned_text

In [30]:
# Test clean_text function to confirm that it works as expected 
dirty_text = "This is an example sentence with some numbers like 123, and special characters !@#$, as well as URLs https://www.example.com"
cleaned_text = clean_text(dirty_text)
print(cleaned_text)

example sentence number like special character well url


In [31]:
# Create new cleaned_text column by applying the clean_text function to the article text 
df_merged_articles['cleaned_text'] = df_merged_articles['text'].apply(clean_text)
df_merged_articles.head()

Unnamed: 0,title,text,subject,class,cleaned_text
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,0,donald trump wish american happy new year leav...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,0,house intelligence committee chairman devin nu...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,0,friday revealed former milwaukee sheriff david...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,0,christmas day donald trump announced would bac...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,0,pope francis used annual christmas day message...


In [32]:
# Display columns of df_merged_articles dataframe after new cleaned_text column was added 
df_merged_articles.columns

Index(['title', 'text', 'subject', 'class', 'cleaned_text'], dtype='object')

In [33]:
# Reorder dataframe columns to create df_merged_articles_clean dataframe before saving for use by other notebooks
reorder_cols = ['title', 'text', 'cleaned_text', 'subject', 'class']

# Create new df_merged_articles_clean dataframe
df_merged_articles_clean = df_merged_articles[reorder_cols]

# Display first 5 rows of df_merged_articles_clean dataframe
df_merged_articles_clean.head()

Unnamed: 0,title,text,cleaned_text,subject,class
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,donald trump wish american happy new year leav...,News,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,house intelligence committee chairman devin nu...,News,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",friday revealed former milwaukee sheriff david...,News,0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",christmas day donald trump announced would bac...,News,0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,pope francis used annual christmas day message...,News,0


In [34]:
# Display info df_merged_articles_clean dataframe
df_merged_articles_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         44898 non-null  object
 1   text          44898 non-null  object
 2   cleaned_text  44898 non-null  object
 3   subject       44898 non-null  object
 4   class         44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


In [35]:
# Save df_merged_articles_clean dataframe after data cleaning in 4 parts to stay within Github allowed
# filesize requirements
df_merged_articles_clean.iloc[:12250].to_csv('Resources/df_merged_articles_clean_part1.csv', index=False)
print("df_merged_articles_clean_part1.csv file saved")
df_merged_articles_clean.iloc[12250:24500].to_csv('Resources/df_merged_articles_clean_part2.csv', index=False)
print("df_merged_articles_clean_part2.csv file saved")
df_merged_articles_clean.iloc[24500:36750].to_csv('Resources/df_merged_articles_clean_part3.csv', index=False)
print("df_merged_articles_clean_part3.csv file saved")
df_merged_articles_clean.iloc[36750:].to_csv('Resources/df_merged_articles_clean_part4.csv', index=False)
print("df_merged_articles_clean_part4.csv file saved")

df_merged_articles_clean_part1.csv file saved
df_merged_articles_clean_part2.csv file saved
df_merged_articles_clean_part3.csv file saved
df_merged_articles_clean_part4.csv file saved
