# Data Cleaning

Import libraries:

In [1]:
import pandas as pd
import numpy as np
import os

Load in data:

In [2]:
# os.listdir returns directory entries in arbitrary order, so sort the names
# (case-insensitively) to make the concatenated row order reproducible
# between runs, machines and filesystems.
files = sorted(os.listdir('../data'), key=str.casefold)

# Read every `*_top_articles.csv` file found in ../data; other files in the
# directory are skipped by the suffix filter.
dataframes = [pd.read_csv(f'../data/{file}') for file in files if file.endswith('_top_articles.csv')]

# Stack the per-file frames into one frame with a fresh 0..n-1 index.
df = pd.concat(dataframes, ignore_index=True)
df

Unnamed: 0,submission_id,posted_time_stamp,subreddit,title,url,author,upvotes,downvotes,article_text
0,if35i0,2020-08-23 16:26:16,Anarcho_Capitalism,Property rights,https://jssocial.pw/ppkey/fget/x0x7/upload/JQk...,x0x7,2030,476,
1,hgoib5,2020-06-27 09:28:04,Anarcho_Capitalism,leftist.jpg,https://files.catbox.moe/pdzsl5.JPG,R334Cti0N4Ry_Lib3Rty,1397,286,
2,aib9qb,2019-01-21 17:45:06,Anarcho_Capitalism,The ridiculous irony of reddit politics,https://files.catbox.moe/q5fbf3.jpeg,RetroForte,1366,135,
3,57qxn7,2016-10-16 13:50:59,Anarcho_Capitalism,CNN: It's illegal for you to read the leaked e...,https://streamable.com/6g5v,pseudoRndNbr,1351,238,
4,ezuipx,2020-02-06 18:12:10,Anarcho_Capitalism,bernies 2020 in one jpg,https://i2.wp.com/stonetoss.com/wp-content/upl...,,1333,273,
...,...,...,...,...,...,...,...,...,...
2316,9j1xpp,2018-09-26 15:12:57,Republican,"Charlie Daniels: America, It’s Time to Cool Do...",https://www.cnsnews.com/commentary/charlie-dan...,DEYoungRepublicans,524,52,charlie daniels it seems that – for the most p...
2317,d4b2jf,2019-09-15 00:08:37,Republican,VIDEO: Black Trump Supporter Told Not to Wear ...,https://www.thedailypatriot.club/videos/video-...,LisaLuvsYou,523,52,
2318,acth6r,2019-01-05 13:40:33,Republican,Ocasio-Cortez floats 70 percent tax on the sup...,https://www.politico.com/story/2019/01/04/ocas...,The_seph_i_am,520,64,“i think that it only has ever been radicals t...
2319,gay1r8,2020-04-30 18:42:18,Republican,California Republican Party suing state's gove...,https://www.cbsnews.com/news/california-republ...,raffu280,519,45,the california republican party is suing gover...


Remove rows with a missing value in any column, then drop articles whose text is 50 characters or fewer:

In [3]:
# Keep only rows with no missing value in any column ...
complete_rows = df.dropna()
# ... and, of those, only articles whose text is longer than 50 characters.
has_enough_text = complete_rows['article_text'].str.len() > 50
df = complete_rows[has_enough_text]


In [4]:
# (rows, columns) of the frame that remains after the NA and length filters.
df.shape

(2037, 9)

We have 2037 articles to analyze with 9 features for each article.

Add a `lean` column that labels each post as `left` or `right` based on the subreddit it was posted in.

In [5]:
# Subreddits grouped by the lean label this notebook assigns to them.
left = ['Democrats','LateStageCapitalism','Liberal','JoeBiden']
right = ['Anarcho_Capitalism','Conservative','donaldtrump','Republican']

# Explicit subreddit -> lean lookup built from BOTH lists, so that a subreddit
# missing from them is detected instead of silently falling through to 'right'.
lean_by_subreddit = {**dict.fromkeys(left, 'left'), **dict.fromkeys(right, 'right')}

# Fail loudly if the data contains a subreddit with no assigned lean.
unmapped = sorted(set(df['subreddit']) - set(lean_by_subreddit))
if unmapped:
    raise ValueError(f'No lean defined for subreddit(s): {unmapped}')

df = df.assign(lean=df['subreddit'].map(lean_by_subreddit))



Article counts by subreddit:

In [10]:
# Number of remaining articles per subreddit, most frequent first.
df['subreddit'].value_counts()

Liberal                850
Democrats              386
Republican             349
Conservative           325
JoeBiden               103
donaldtrump             14
LateStageCapitalism      9
Anarcho_Capitalism       1
Name: subreddit, dtype: int64

Save cleaned data as a `.csv`.

In [6]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
clean_csv_path = '../data/clean_data.csv'
df.to_csv(clean_csv_path, index=False)