In [71]:
# Import Dependencies
import pandas as pd
import numpy as np

In [72]:
# Load the fake-news and true-news article tables from their CSV files
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/Fake.csv', 'Data/True.csv')
)

## Clean Fake DF

In [73]:
# Preview the first five rows of the fake-news frame
fake_df.head(n=5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [74]:
# Non-null entries per column; compare the counts with each other and with
# the row count to spot missing values
fake_df.notna().sum()

title      23481
text       23481
subject    23481
date       23481
dtype: int64

In [75]:
# Distinct (non-null) values per column of the fake-news frame
fake_df.nunique(axis=0, dropna=True)

title      17903
text       17455
subject        6
date        1681
dtype: int64

In [76]:
# How many fake articles fall under each subject, most frequent first
fake_df.loc[:, 'subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Middle-east         778
Name: subject, dtype: int64

In [77]:
# Titles under Subject = 'News', selected with a single .loc call
# (row mask + column label) instead of chained [] indexing, which first
# materialises an intermediate filtered frame
fake_df.loc[fake_df['subject'] == 'News', 'title']

0        Donald Trump Sends Out Embarrassing New Year’...
1        Drunk Bragging Trump Staffer Started Russian ...
2        Sheriff David Clarke Becomes An Internet Joke...
3        Trump Is So Obsessed He Even Has Obama’s Name...
4        Pope Francis Just Called Out Donald Trump Dur...
                              ...                        
9045     Judge Serves A Crushing Blow To The Florida G...
9046     Bill Cosby Thanks ‘Friends And Fans’ In New Y...
9047     Obama Announces ‘Unfinished Business’ For 201...
9048     Damning New Evidence Shows How Israel Bribed ...
9049     Ben Carson Campaign In Shambles After Top Aid...
Name: title, Length: 9050, dtype: object

In [78]:
# Drop World News: remove the records whose subject is 'Middle-east'.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not operate on a slice of the original
# (which can raise SettingWithCopyWarning).
fake_df = fake_df[fake_df['subject'] != 'Middle-east'].copy()
fake_df['subject'].value_counts()

News               9050
politics           6841
left-news          4459
Government News    1570
US_News             783
Name: subject, dtype: int64

In [79]:
# Rename all subjects as Politics.
# assign() returns a new frame with the column overwritten, so this works
# cleanly even when fake_df is the result of a boolean filter (plain
# column assignment on such a slice can raise SettingWithCopyWarning).
fake_df = fake_df.assign(subject='Politics')
fake_df['subject'].value_counts()

Politics    22703
Name: subject, dtype: int64

In [80]:
# Remove the 'date' column and show the columns that remain
fake_df = fake_df.drop('date', axis='columns')
fake_df.columns

Index(['title', 'text', 'subject'], dtype='object')

In [81]:
# Tag every row of the fake-news frame with label 1 in a new 'label' column
fake_df = fake_df.assign(label=1)
fake_df.columns

Index(['title', 'text', 'subject', 'label'], dtype='object')

In [82]:
# First five rows of the fake-news frame after the transformations above
fake_df.head(n=5)

Unnamed: 0,title,text,subject,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,Politics,1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,Politics,1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",Politics,1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",Politics,1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,Politics,1


## Clean True DF

In [83]:
# Non-null entries per column; compare the counts with each other and with
# the row count to spot missing values
true_df.notna().sum()

title      21417
text       21417
subject    21417
date       21417
dtype: int64

In [84]:
# Distinct (non-null) values per column of the true-news frame
true_df.nunique(axis=0, dropna=True)

title      20826
text       21192
subject        2
date         716
dtype: int64

In [85]:
# How many true articles fall under each subject, most frequent first
true_df.loc[:, 'subject'].value_counts()

politicsNews    11272
worldnews       10145
Name: subject, dtype: int64

In [87]:
# Drop World News: remove the records whose subject is 'worldnews'.
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not operate on a slice of the original
# (which can raise SettingWithCopyWarning).
true_df = true_df[true_df['subject'] != 'worldnews'].copy()
true_df.nunique()

title      11150
text       11214
subject        1
date         716
dtype: int64

In [88]:
# Rename all subjects as Politics.
# assign() returns a new frame with the column overwritten, so this works
# cleanly even when true_df is the result of a boolean filter (plain
# column assignment on such a slice can raise SettingWithCopyWarning).
true_df = true_df.assign(subject='Politics')
true_df['subject'].value_counts()

Politics    11272
Name: subject, dtype: int64

In [89]:
# Remove the 'date' column and show the columns that remain
true_df = true_df.drop('date', axis='columns')
true_df.columns

Index(['title', 'text', 'subject'], dtype='object')

In [90]:
# Tag every row of the true-news frame with label 0 in a new 'label' column
true_df = true_df.assign(label=0)
true_df.columns

Index(['title', 'text', 'subject', 'label'], dtype='object')

## Merge the Dataframes

In [91]:
# Stack the labelled fake and true frames into one articles table.
# Both inputs still carry the row labels from their own CSVs, so a plain
# concat would leave duplicated index values (e.g. two rows labelled 0);
# ignore_index=True renumbers the merged rows 0..n-1 instead.
dataframes = [fake_df, true_df]
articles_df = pd.concat(dataframes, ignore_index=True)
articles_df.count()

title      33975
text       33975
subject    33975
label      33975
dtype: int64

## Export the Articles Dataframe

In [92]:
# Write the merged articles to CSV, leaving the row index out of the file
articles_df.to_csv(path_or_buf='Data/articles.csv', index=False)