In [41]:
import pandas as pd

In [42]:
df = pd.read_csv('data/top_reddit_posts.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6947 entries, 0 to 6946
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           6947 non-null   int64  
 1   Title                6947 non-null   object 
 2   Author               6943 non-null   object 
 3   Subreddit            6947 non-null   object 
 4   Score                6947 non-null   int64  
 5   Permalink            6947 non-null   object 
 6   Creation Time        6947 non-null   float64
 7   Number of Comments   6947 non-null   int64  
 8   Upvote Ratio         6947 non-null   float64
 9   URL                  6947 non-null   object 
 10  Post ID              6947 non-null   object 
 11  Is Original Content  6947 non-null   bool   
 12  Flair                4034 non-null   object 
 13  Comments             6947 non-null   object 
dtypes: bool(1), float64(2), int64(3), object(8)
memory usage: 712.5+ KB


Unnamed: 0.1,Unnamed: 0,Title,Author,Subreddit,Score,Permalink,Creation Time,Number of Comments,Upvote Ratio,URL,Post ID,Is Original Content,Flair,Comments
0,0,"What's the worst possible reply to ""I'm pregna...",EmmanuelMoyta,AskReddit,15601,/r/AskReddit/comments/14cmu88/whats_the_worst_...,1687101000.0,15474,0.87,https://www.reddit.com/r/AskReddit/comments/14...,14cmu88,False,,"[{'Comment ID': 'jolp2kq', 'Comment Author': '..."
1,1,"People who work at super fancy hotels, what ki...",akumamatata8080,AskReddit,12789,/r/AskReddit/comments/14cztbf/people_who_work_...,1687135000.0,3340,0.95,https://www.reddit.com/r/AskReddit/comments/14...,14cztbf,False,,"[{'Comment ID': 'joo2xg6', 'Comment Author': '..."
2,2,"People who have 4+ kids, why?",GabesterMc,AskReddit,11689,/r/AskReddit/comments/14cqute/people_who_have_...,1687111000.0,4457,0.84,https://www.reddit.com/r/AskReddit/comments/14...,14cqute,False,,"[{'Comment ID': 'jon5qdw', 'Comment Author': '..."
3,3,What's the most fucked up drama from somewhere...,GransShortbread,AskReddit,8814,/r/AskReddit/comments/14coqqm/whats_the_most_f...,1687106000.0,3889,0.95,https://www.reddit.com/r/AskReddit/comments/14...,14coqqm,False,,"[{'Comment ID': 'jommvy3', 'Comment Author': '..."
4,4,If your partner came home and told you they ac...,Alternative-Cod8891,AskReddit,4876,/r/AskReddit/comments/14ctccu/if_your_partner_...,1687118000.0,2422,0.95,https://www.reddit.com/r/AskReddit/comments/14...,14ctccu,False,,"[{'Comment ID': 'jonga3u', 'Comment Author': '..."


In [43]:
# drop the index column
df = df.drop(columns='Unnamed: 0')

# format the column names in snake cases
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(' ', '_')

# all posts are not original, the is_original_content column is meaningless and can be dropped
print(df.is_original_content.describe())

# drop columns not needed: author, permalink, url, is_original_content
df = df.drop(columns=['author', 'permalink', 'url', 'is_original_content'])

# convert dtype of text columns to string from object
columns = ['title', 'subreddit', 'flair', 'post_id']
df[columns] = df[columns].astype(str)

# creation time is 10-digit integer corresponds to a unit of second
# convert unix timestamp to datetime64
df['creation_time'] = pd.to_datetime(df['creation_time'], unit='s')

# convert subreddit to categorical data type
df['subreddit'] = df['subreddit'].astype('category')

# re-order the columns
df = df[['post_id', 'subreddit', 'title', 'flair', 'comments', 'score', 'creation_time', 'number_of_comments', 'upvote_ratio']]

df.info()

count      6947
unique        1
top       False
freq       6947
Name: is_original_content, dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6947 entries, 0 to 6946
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   post_id             6947 non-null   object        
 1   subreddit           6947 non-null   category      
 2   title               6947 non-null   object        
 3   flair               6947 non-null   object        
 4   comments            6947 non-null   object        
 5   score               6947 non-null   int64         
 6   creation_time       6947 non-null   datetime64[ns]
 7   number_of_comments  6947 non-null   int64         
 8   upvote_ratio        6947 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 441.5+ KB


In [44]:
# truncate the data to start from 2023-07-15 to make sure the data can be compressed to under 100MB
df = df.loc[df['creation_time'] >= '2023-07-15']

In [45]:
def eval_comments(x: pd.Series):
    # convert a string of list of dictionaries to list of dictionaries
    comment_list = eval(x)
    
    converted_list = []
    
    # Convert keys to lowercase and replace spaces with underscores
    for d in comment_list:
        new_d = {}
        for key, value in d.items():
            # format keys in snake case
            new_key = key.lower().replace(" ", "_")
            new_d[new_key] = value
        
        converted_list.append(new_d)
        
    return converted_list
    
# convert dtype of comments column from string to a list of dictionaries
df['comments'] = df['comments'].apply(func=eval_comments)

In [46]:
# convert the df from "wide" to long format by exploding the comments column
# Step 1: Explode the `comments` column so each dictionary within the list is in a separate row
df = df.explode('comments', ignore_index=True)

# Step 2: Normalize the dictionary in the `comments` column to extract specific fields
df_comments = pd.json_normalize(df['comments'])

In [47]:
# df_posts has a primary key, post_id
df_posts = df.drop(columns="comments").drop_duplicates().reset_index(drop=True)

df_posts.info()

# concate columns (post_id, comment_content, comment_score, comment_created_utc) to create the comment df
# df_comments have a foreign key, post_id, related to the primary key in df_posts
df_comments = pd.concat(
    [
        df["post_id"],
        df_comments[["comment_content", "comment_score", "comment_created_utc"]],
    ],
    axis=1,
)

# format types for columns
df_comments['comment_content'] = df_comments['comment_content'].astype(str)
df_comments['comment_score'] = df_comments['comment_score'].astype('Int64')
df_comments['comment_created_utc'] = pd.to_datetime(df_comments['comment_created_utc'], unit='s')

df_comments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4275 entries, 0 to 4274
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   post_id             4275 non-null   object        
 1   subreddit           4275 non-null   category      
 2   title               4275 non-null   object        
 3   flair               4275 non-null   object        
 4   score               4275 non-null   int64         
 5   creation_time       4275 non-null   datetime64[ns]
 6   number_of_comments  4275 non-null   int64         
 7   upvote_ratio        4275 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 238.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983153 entries, 0 to 983152
Data columns (total 4 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   post_id

In [48]:
df_posts.head()

Unnamed: 0,post_id,subreddit,title,flair,score,creation_time,number_of_comments,upvote_ratio
0,1501ibb,AskReddit,"What Worst possible reply to ""I'm pregnant""?",,11202,2023-07-15 04:06:56,12625,0.85
1,14zz798,AskReddit,What is a personal story you have that you don...,,6607,2023-07-15 02:12:11,2891,0.97
2,14zx7iq,AskReddit,How did that person in your class become rich?,,4251,2023-07-15 00:38:28,2542,0.94
3,150aarj,AskReddit,What is the worst reply to “I’m leaving you”?,,3608,2023-07-15 12:03:43,3564,0.89
4,150j9g1,AskReddit,Who's the Most Dangerous Human Alive Right Now?,,2287,2023-07-15 18:28:51,3389,0.66


In [49]:
df_comments.head()

Unnamed: 0,post_id,comment_content,comment_score,comment_created_utc
0,1501ibb,Did you get a second opinion?,6894,2023-07-15 04:18:00
1,1501ibb,On purpose?,15404,2023-07-15 04:21:36
2,1501ibb,"""Why does this keep happening to me?! Second t...",11788,2023-07-15 04:41:47
3,1501ibb,“Is it yours?”,24645,2023-07-15 04:35:52
4,1501ibb,Congrats! Whose is it?,8216,2023-07-15 04:07:51


In [50]:
df_posts.to_parquet(path='data/posts.parquet.brotli', compression='brotli', index=False)
df_comments.to_parquet(path='data/comments.parquet.brotli', compression='brotli', index=False)