# Cleaning Reddit Data
- The full pull will have duplicates that need to be removed
- This notebook should only be run once, as re-running it will overwrite the `*_clean.csv` output files

In [50]:
import pandas as pd

In [51]:
## Load data
# Read the six raw CSV pulls (posts and comments for the nsc, tt and preds
# datasets) in one pass; the order of the paths matches the order of the names.
(
    nsc_posts,
    nsc_comments,
    tt_posts,
    tt_comments,
    preds_posts,
    preds_comments,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Reddit Data/nsc_posts_final.csv',
        '../Reddit Data/nsc_comments_final.csv',
        '../Reddit Data/tt_posts_final.csv',
        '../Reddit Data/tt_comments_final.csv',
        '../Reddit Data/preds_posts_final.csv',
        '../Reddit Data/preds_comments_final.csv',
    )
)

In [52]:
# Show the column layout of one posts frame and one comments frame.
for frame in (nsc_posts, nsc_comments):
    print(frame.columns)

Index(['Unnamed: 0', 'id', 'Title', 'Content', 'Author', 'Post Date'], dtype='object')
Index(['Unnamed: 0', 'Comment ID', 'Parent Comment ID', 'Text', 'Author',
       'Date', 'Post ID'],
      dtype='object')


In [53]:
# Row/column counts of every frame before any cleaning.
for frame in (
    nsc_posts,
    nsc_comments,
    tt_posts,
    tt_comments,
    preds_posts,
    preds_comments,
):
    print(frame.shape)

(3941, 6)
(54232, 7)
(3831, 6)
(155541, 7)
(3658, 6)
(62904, 7)


In [54]:
## Removing duplicates from all dataframes
# Posts are de-duplicated on 'id', comments on 'Comment ID'. Every frame is
# modified in place, so the existing variable names keep pointing at the
# de-duplicated data.
for frame, key_column in (
    (nsc_posts, 'id'),
    (nsc_comments, 'Comment ID'),
    (tt_posts, 'id'),
    (tt_comments, 'Comment ID'),
    (preds_posts, 'id'),
    (preds_comments, 'Comment ID'),
):
    frame.drop_duplicates(subset=[key_column], inplace=True)

In [55]:
## Clean preds data from bots
# Authors whose rows are dropped from the preds posts and comments.
preds_bots = ['subredditsummarybot', 'AutoModerator']
# .copy() makes each filtered result an independent frame. Without it,
# preds_posts is a boolean-mask slice of the loaded frame, and the later
# in-place assignment of its 'Post Date' column raises SettingWithCopyWarning.
preds_posts = preds_posts[~preds_posts['Author'].isin(preds_bots)].copy()
preds_comments = preds_comments[~preds_comments['Author'].isin(preds_bots)].copy()

In [56]:
## Filter by date
# Parse 'Post Date' into datetimes on each posts frame (assigned in place) ...
for posts_frame in (nsc_posts, tt_posts, preds_posts):
    posts_frame['Post Date'] = pd.to_datetime(posts_frame['Post Date'])

# ... then keep only the posts whose year is 2022 or later.
nsc_posts = nsc_posts.loc[nsc_posts['Post Date'].dt.year.ge(2022)]
tt_posts = tt_posts.loc[tt_posts['Post Date'].dt.year.ge(2022)]
preds_posts = preds_posts.loc[preds_posts['Post Date'].dt.year.ge(2022)]

In [57]:
## Making sure all rows of comments dataframes are from posts in Posts dataframes

# Collect the ids of the posts still present in each posts frame ...
nsc_ids, tt_ids, preds_ids = (
    frame['id'].unique() for frame in (nsc_posts, tt_posts, preds_posts)
)

# ... and keep only the comments whose 'Post ID' is one of those ids. Each
# result is copied so it is an independent frame rather than a slice.
nsc_comments = nsc_comments.loc[nsc_comments['Post ID'].isin(nsc_ids)].copy()
tt_comments = tt_comments.loc[tt_comments['Post ID'].isin(tt_ids)].copy()
preds_comments = preds_comments.loc[preds_comments['Post ID'].isin(preds_ids)].copy()

In [58]:
## Removing posts with no comments
# For each dataset, collect the post ids that never appear as a comment's
# 'Post ID'. Set difference (posts minus comments) is the operation that
# matches this intent; a symmetric difference would additionally pick up ids
# that occur only on the comments side, which are irrelevant when filtering
# the posts frames.
nsc_ids_posts = set(nsc_posts['id'].unique())
nsc_ids_comments = set(nsc_comments['Post ID'].unique())
nsc_diff = nsc_ids_posts - nsc_ids_comments
tt_ids_posts = set(tt_posts['id'].unique())
tt_ids_comments = set(tt_comments['Post ID'].unique())
tt_diff = tt_ids_posts - tt_ids_comments
preds_ids_posts = set(preds_posts['id'].unique())
preds_ids_comments = set(preds_comments['Post ID'].unique())
preds_diff = preds_ids_posts - preds_ids_comments

# Drop the posts whose id is in the "no comments" set.
nsc_posts = nsc_posts[~nsc_posts['id'].isin(nsc_diff)]
tt_posts = tt_posts[~tt_posts['id'].isin(tt_diff)]
preds_posts = preds_posts[~preds_posts['id'].isin(preds_diff)]

In [59]:
# Recheck sizes
# Same six frames as the earlier size check, now after cleaning.
for frame in (
    nsc_posts,
    nsc_comments,
    tt_posts,
    tt_comments,
    preds_posts,
    preds_comments,
):
    print(frame.shape)

(1156, 6)
(19377, 7)
(1556, 6)
(68284, 7)
(969, 6)
(16210, 7)


In [60]:
## Rewriting files
# Write each cleaned frame to its '*_clean.csv' destination using to_csv's
# default options.
# NOTE(review): the defaults also write the index as a leading unnamed column,
# next to the existing 'Unnamed: 0' column — confirm that is wanted.
for frame, out_path in (
    (nsc_posts, '../Reddit Data/nsc_posts_clean.csv'),
    (nsc_comments, '../Reddit Data/nsc_comments_clean.csv'),
    (tt_posts, '../Reddit Data/tt_posts_clean.csv'),
    (tt_comments, '../Reddit Data/tt_comments_clean.csv'),
    (preds_posts, '../Reddit Data/preds_posts_clean.csv'),
    (preds_comments, '../Reddit Data/preds_comments_clean.csv'),
):
    frame.to_csv(path_or_buf=out_path)