## Import Dependencies

In [9]:
# Import Dependencies
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
# In the same folder/directory, Create a config.py file that contains postgrespwd = '<your postgres password>'
from config import postgrespwd

## Database Connection

In [10]:
# Connect to the local PostgreSQL database "FakeNewsDetector".
# Requires a PostgreSQL driver: run "pip install psycopg2-binary" in mlenv.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" alias and fails with NoSuchModuleError when given it.
# NOTE: a password containing URL-special characters (e.g. "@", "/", ":") must
# be URL-encoded before it is interpolated into this string.
db_string = f"postgresql://postgres:{postgrespwd}@localhost:5432/FakeNewsDetector"
engine = create_engine(db_string)

## Data Staging

In [11]:
# Load the fake-news and true-news CSV files from the resources folder
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in ('resources/Fake.csv', 'resources/True.csv')
)

In [12]:
# Stage the unmodified source frames in PostgreSQL; an existing table of the
# same name is replaced and the DataFrame index is not written as a column
fake_df.to_sql(name='FakeNews', con=engine, if_exists='replace', index=False)
true_df.to_sql(name='TrueNews', con=engine, if_exists='replace', index=False)

## Data Transformation

### Clean Fake DF

In [13]:
# Preview the first five rows of the raw fake-news frame
fake_df.head(n=5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [14]:
# Non-null entries per column; a column whose total is below the row count
# contains missing values
fake_df.notna().sum()

title      23481
text       23481
subject    23481
date       23481
dtype: int64

In [15]:
# TODO: Delete articles whose text is blank (" ") -- not implemented yet; this cell currently removes nothing

In [16]:
# Distinct values per column (missing values are not counted)
fake_df.nunique(axis=0, dropna=True)

title      17903
text       17455
subject        6
date        1681
dtype: int64

In [17]:
# How many articles carry each subject label
fake_df.loc[:, 'subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [18]:
# Titles of the articles whose subject is exactly 'News'
# (row mask and column picked in one .loc lookup)
fake_df.loc[fake_df['subject'] == 'News', 'title']

0        Donald Trump Sends Out Embarrassing New Year’...
1        Drunk Bragging Trump Staffer Started Russian ...
2        Sheriff David Clarke Becomes An Internet Joke...
3        Trump Is So Obsessed He Even Has Obama’s Name...
4        Pope Francis Just Called Out Donald Trump Dur...
                              ...                        
9045     Judge Serves A Crushing Blow To The Florida G...
9046     Bill Cosby Thanks ‘Friends And Fans’ In New Y...
9047     Obama Announces ‘Unfinished Business’ For 201...
9048     Damning New Evidence Shows How Israel Bribed ...
9049     Ben Carson Campaign In Shambles After Top Aid...
Name: title, Length: 9050, dtype: object

In [19]:
# Collapse the original subject labels into two buckets, 'US News' and
# 'World News', using one old-label -> new-label mapping
fake_df['subject'] = fake_df['subject'].replace({
    'News': 'US News',
    'politics': 'US News',
    'left-news': 'US News',
    'Government News': 'US News',
    'US_News': 'US News',
    'Middle-east': 'World News',
})
fake_df['subject'].value_counts()

US News       22703
World News      778
Name: subject, dtype: int64

In [20]:
# Discard the 'date' column, then list the columns that remain
fake_df = fake_df.drop(labels='date', axis='columns')
fake_df.columns

Index(['title', 'text', 'subject'], dtype='object')

In [21]:
# Tag every fake article with label = 1, then list the resulting columns
fake_df = fake_df.assign(label=1)
fake_df.columns

Index(['title', 'text', 'subject', 'label'], dtype='object')

In [22]:
# First five rows of the cleaned, labelled fake-news frame
fake_df.head(n=5)

Unnamed: 0,title,text,subject,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,US News,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,US News,1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",US News,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",US News,1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,US News,1


### Clean True DF

In [23]:
# Non-null entries per column; a column whose total is below the row count
# contains missing values
true_df.notna().sum()

title      21417
text       21417
subject    21417
date       21417
dtype: int64

In [24]:
# TODO: Delete articles whose text is blank (" ") -- not implemented yet; this cell currently removes nothing

In [25]:
# Distinct values per column (missing values are not counted)
true_df.nunique(axis=0, dropna=True)

title      20826
text       21192
subject        2
date         716
dtype: int64

In [26]:
# How many articles carry each subject label
true_df.loc[:, 'subject'].value_counts()

politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

In [27]:
# Map the two original subject labels onto 'US News' and 'World News'
# using one old-label -> new-label mapping
true_df['subject'] = true_df['subject'].replace({
    'politicsNews': 'US News',
    'worldnews': 'World News',
})
true_df['subject'].value_counts()

US News       11272
World News    10145
Name: subject, dtype: int64

In [28]:
# Discard the 'date' column, then list the columns that remain
true_df = true_df.drop(labels='date', axis='columns')
true_df.columns

Index(['title', 'text', 'subject'], dtype='object')

In [29]:
# Tag every true article with label = 0, then list the resulting columns
true_df = true_df.assign(label=0)
true_df.columns

Index(['title', 'text', 'subject', 'label'], dtype='object')

### Merge the Dataframes

In [30]:
# Stack the labelled fake and true articles into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# source frame keeps its own row labels, so labels present in both frames
# appear twice and label-based lookups (.loc) return more than one row.
dataframes = [fake_df, true_df]
articles_df = pd.concat(dataframes, ignore_index=True)
# Non-null entries per column of the merged frame
articles_df.count()

title      44898
text       44898
subject    44898
label      44898
dtype: int64

### Export the Articles Dataframe to Database

In [31]:
# Write the merged, labelled articles to the 'Articles' table, replacing any
# existing table and leaving the DataFrame index out
articles_df.to_sql(name='Articles', con=engine, if_exists='replace', index=False)

## Natural Language Processing

In [32]:
# NLP Code goes here

In [34]:
# Export NLP Output to Postgres
# TODO: the line below is a template, not valid Python -- "<dataframe>" and
# "<table>" are placeholders and raise a SyntaxError if the cell is run.
# Substitute the NLP output DataFrame and its target table name, then uncomment.
# <dataframe>.to_sql('<table>', engine, if_exists='replace',index=False)

## Machine Learning Model

In [35]:
# ML Code goes here

In [36]:
# Export ML Output to Postgres
# TODO: the line below is a template, not valid Python -- "<dataframe>" and
# "<table>" are placeholders and raise a SyntaxError if the cell is run.
# Substitute the ML output DataFrame and its target table name, then uncomment.
# <dataframe>.to_sql('<table>', engine, if_exists='replace',index=False)