### Import Libraries and Read in Data

In [71]:
import numpy as np 
import pandas as pd

In [72]:
# Sarcasm Data
sarcasm_df = \
pd.read_json("../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json", lines=True)

In [73]:
# PolitiFact Data
pf_fake_df = pd.read_csv("../input/fake-news-data/fnn_politics_fake.csv")
pf_real_df = pd.read_csv("../input/fake-news-data/fnn_politics_real.csv")

In [74]:
# GossipCop Data
gc_fake_df = pd.read_csv("../input/fakenewsnet-gossipcop/gossipcop_fake.csv")
gc_real_df = pd.read_csv("../input/fakenewsnet-gossipcop/gossipcop_real.csv")

In [75]:
# News Articles from 15 different sites

news_articles_df = pd.read_csv("../input/fake-news-data/news_articles_full.csv", encoding='unicode_escape')

In [76]:
# Buzzfeed fake/real news data

buzzf_fake_df = pd.read_csv("../input/fakenewsnet/BuzzFeed_fake_news_content.csv")
buzzf_real_df = pd.read_csv("../input/fakenewsnet/BuzzFeed_real_news_content.csv")

### Merging different data sources into one big data set

In [77]:
# Create target variables for PolitiFact data
pf_real_df['target'] = 'real'
pf_fake_df['target'] = 'fake'

# Merge fake and real PolitiFact data into one
pf_df = pd.concat([pf_real_df, pf_fake_df])

In [78]:
# Drop Unnecessary Columns from PolitiFact data
pf_df = pf_df.copy()[['news_url','title','target']]

# Change name of columns for easier concatenation later
pf_df.columns = ['url','title','target']

In [79]:
# Create target variables for gossipcop data
gc_real_df['target'] = 'real'
gc_fake_df['target'] = 'fake'

# Merge fake and real gossipcop data into one
gc_df = pd.concat([gc_real_df, gc_fake_df])

In [80]:
# Drop Unnecessary Columns from gossipcop data
gc_df = gc_df.copy()[['news_url','title','target']]

# Change name of columns for easier concatenation later
gc_df.columns = ['url','title','target']

In [81]:
# Create target variables for Buzzfeed news data
buzzf_fake_df['target'] = 'fake'
buzzf_real_df['target'] = 'real'

# Merge fake and real buzzfeed news data into one
buzzf_df = pd.concat([buzzf_real_df, buzzf_fake_df])

In [82]:
# Drop Unnecessary Columns from Buzzfeed news data
buzzf_df = buzzf_df.copy()[['title','text','url','target']]

In [83]:
# Change name of columns for easier concatenation later
sarcasm_df.columns = ['url','title','target']

# Replace 0 and 1s to appropriate target labels (e.g. sarcasm, real)
sarcasm_df['target'] = sarcasm_df['target'].replace({0:'real',1:'sarcasm'})

In [84]:
# Create target variables for news articles data
news_articles_df['target'] = 'real'

# Drop Unnecessary Columns from new articles data
news_articles_df = news_articles_df.copy()[['article_source_link','title','text', 'target']] 

# Change name of columns for easier concatenation later
news_articles_df.columns = ['url','title','text','target']

In [90]:
# Merge all wrangled data sources into one big dataset

df = pd.concat([news_articles_df, buzzf_df, sarcasm_df, gc_df, pf_df], sort=True)

### Missing Values

Some datasets did not have "text" columns. There are some missing "titles/headlines" too. For now, we fill them with the character "Missing" and we can encode them as a separate category later.

In [92]:
missing_feats = ['text','title','url']
for feat in missing_feats:
    df[feat].fillna('missing',inplace=True)

In [93]:
# No more missing values!
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53911 entries, 0 to 431
Data columns (total 4 columns):
target    53911 non-null object
text      53911 non-null object
title     53911 non-null object
url       53911 non-null object
dtypes: object(4)
memory usage: 2.1+ MB


In [None]:
# Save wrangled data locally for later use

df.to_csv('wrangled_full_dataset.csv')