In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from config import config as cfg
from config import utils

In [6]:
# Dataset directory: the path reported by `utils.get_repo_path()` joined with
# the data-folder name configured in `cfg.DATA_FOLDER`.
repo_root = utils.get_repo_path()
data_folder = repo_root / cfg.DATA_FOLDER

In [20]:
# The files are read one by one because their layouts are inconsistent (three
# of them are ';'-separated). The numbered comments mark where each source
# dataset starts, since some datasets ship as several files.
def _read_dataset(filename, **read_csv_kwargs):
    """Read `filename` from `data_folder`, forwarding extra options to `pd.read_csv`."""
    return pd.read_csv(data_folder / filename, **read_csv_kwargs)


# 1
clickbait_data = _read_dataset("clickbait_data.csv")

# 2
evaluation = _read_dataset("evaluation.csv", sep=';')
test1 = _read_dataset("test (1).csv", sep=';')
train2 = _read_dataset("train (2).csv", sep=';')

# 3
true_data = _read_dataset("true.csv")
fake_data = _read_dataset("fake.csv")

# 4
fake_news_net = _read_dataset("FakeNewsNet.csv")

In [24]:
# Preview the first five rows of the clickbait headlines dataset.
clickbait_data.head(n=5)

Unnamed: 0,headline,clickbait
0,Should I Get Bings,1
1,Which TV Female Friend Group Do You Belong In,1
2,"The New ""Star Wars: The Force Awakens"" Trailer...",1
3,"This Vine Of New York On ""Celebrity Big Brothe...",1
4,A Couple Did A Stunning Photo Shoot With Their...,1


This one doesn't need any changes.

In [25]:
# Preview the first five rows of the ';'-separated evaluation split.
evaluation.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,"Sanders back in U.S. Senate, blasts 'coloniali...",WASHINGTON (Reuters) - Democratic U.S. preside...,1
1,1,Kremlin: Syria peoples' congress being 'active...,MOSCOW (Reuters) - A proposal to convene a con...,1
2,2,Oregon Cop Convicted Of Shattering Biker’s Co...,"In a baffling fit of rage, an Oregon State Pol...",0
3,3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0
4,4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0


In [38]:
# Drop the leftover "Unnamed: 0" column (a saved row counter, as seen in the
# preview of `evaluation`) from all three splits.
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it no longer raises a KeyError once
# the column is already gone.
evaluation = evaluation.drop(columns="Unnamed: 0", errors="ignore")
test1 = test1.drop(columns="Unnamed: 0", errors="ignore")
train2 = train2.drop(columns="Unnamed: 0", errors="ignore")

We can combine those three while keeping only the unique rows; the dataset is large enough that some repeats are bound to happen.
Also, while we're at it, let's strip the leading dateline from the 'text' column whenever it starts with "\<CITY NAME\> (Reuters) - ".

In [45]:
# Stack the three splits into one frame with a fresh 0..n-1 index.
combined_df = pd.concat([evaluation, test1, train2], ignore_index=True)
print(len(combined_df))

# Leading dateline such as "WASHINGTON (Reuters) - ": upper-case words and
# whitespace, then "(Reuters)", then a dash.
reuters_dateline = r'^[A-Z\s]+?\s*\(Reuters\)\s*-\s*'

# Keep the first row for every title, then strip the dateline from `text`.
# `.assign` builds a new frame, which avoids the SettingWithCopyWarning that
# writing through `.loc` into the result of `drop_duplicates` can trigger.
df = (
    combined_df
    .drop_duplicates(subset='title')
    .assign(text=lambda frame: frame['text'].str.replace(reuters_dateline, '', regex=True))
)
print(len(df))


40587
39997


In [46]:
# Display the combined frame to eyeball the result of the clean-up.
df

Unnamed: 0,title,text,label
0,"Sanders back in U.S. Senate, blasts 'coloniali...",Democratic U.S. presidential hopeful Bernie Sa...,1
1,Kremlin: Syria peoples' congress being 'active...,A proposal to convene a congress of all Syria ...,1
2,Oregon Cop Convicted Of Shattering Biker’s Co...,"In a baffling fit of rage, an Oregon State Pol...",0
3,Twitter Erupts With Glee Over #CruzSexScandal...,The last thing any politician running for the ...,0
4,MUST WATCH VIDEO: Obama Tries To Trash Trump B...,This is too good to miss! Mr. Teleprompter did...,0
...,...,...,...
40582,Mexico Senate committee OK's air transport dea...,A key committee in Mexico’s Senate on Thursday...,1
40583,BREAKING: HILLARY CLINTON’S STATE DEPARTMENT G...,IF SHE S NOT TOAST NOW THEN WE RE IN BIGGER TR...,0
40584,trump breaks from stump speech to admire beaut...,kremlin nato was created for agression \nruss...,0
40585,NFL PLAYER Delivers Courageous Message: Stop B...,Dallas Cowboys star wide receiver Dez Bryant t...,0


Ready for usage! Onto the other two...

In [55]:
# Preview the first five rows loaded from "true.csv".
# NOTE(review): the rows this cell last displayed have subject "News" and
# strongly opinionated headlines, which looks like the fake-news file rather
# than the true one. Confirm that true.csv / fake.csv are not swapped before
# labels are assigned.
true_data.head(n=5)

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [None]:
# Preview the first five rows loaded from "fake.csv".
fake_data.head(n=5)

In [None]:
# Label the two files (1 = true, 0 = fake) and drop the `date` column, which
# won't be needed.
# The original `fake_data.['label'] = 0` was a syntax error, so this cell could
# not run at all. Dropping with errors="ignore" and reassigning (instead of
# inplace=True) also makes the cell safe to re-run.
true_data = true_data.drop(columns='date', errors='ignore').assign(label=1)
fake_data = fake_data.drop(columns='date', errors='ignore').assign(label=0)

In [58]:
# Stack the true and fake articles into a single frame with a fresh index.
df2 = pd.concat(objs=(true_data, fake_data), axis=0, ignore_index=True)

Let's maybe try to concat it with the previous one?

In [59]:
# Append the true/fake articles (without their `subject` column, which `df`
# does not have) to the previously combined frame.
# ignore_index=True matches the other concat calls in this notebook and avoids
# duplicate index labels in the result.
# NOTE(review): the Reuters dateline was stripped from `df["text"]` but not
# from `df2["text"]`. Confirm whether the same clean-up should be applied here.
df3 = pd.concat([df, df2.drop(columns="subject")], ignore_index=True)

In [61]:
# Keep only the first occurrence of every title in the merged frame.
df4 = df3.drop_duplicates(subset=['title'], keep='first')

In [62]:
# Number of unique-title rows after merging both sources.
df4.shape[0]

41037

In [63]:
# Number of unique-title rows in the first combined frame, for comparison.
df.shape[0]

39997

There are a lot of duplicates across our datasets: adding the second source only grows the set of unique titles from 39997 to 41037. Let's therefore use the combined one, but also save the dataframe that had the subjects listed, since it will be useful for data analysis.

In [64]:
# Combined, title-deduplicated articles.
df4.to_csv(data_folder / "fakenews_withtext.csv")

# Subject metadata for later analysis. Only `df2` still carries the `subject`
# column (it is dropped before `df3` is built), so export from `df2`.
# Exporting from `df3` wrote a file containing nothing but the labels.
df2.drop(columns=["title", "text"]).to_csv(data_folder / "fakenews_subjects.csv")

And let's check the last one.

In [66]:
# Preview the first five rows of the FakeNewsNet dataset.
fake_news_net.head(n=5)

Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


The news URL seems completely unnecessary. Let's keep the other columns, though.

In [68]:
# Save FakeNewsNet without the `news_url` column; every other column is kept.
fake_news_net_trimmed = fake_news_net.drop(columns=["news_url"])
fake_news_net_trimmed.to_csv(data_folder / "fakenews_notext.csv")