# Test Pipeline
> This notebook explores how to parse Reddit comments into the structure we need, and sets up an example pipeline that can be scaled up later in the project.

In [1]:
import os
from datetime import datetime, timezone

import pandas as pd
import praw

# Build the Reddit API client.
# OAuth credentials are read from the environment so they never live in the
# notebook or its version history. Set REDDIT_CLIENT_ID and
# REDDIT_CLIENT_SECRET before starting the kernel; a missing variable raises
# KeyError here instead of failing later with an opaque authentication error.
# NOTE(review): the secret that was previously committed in this cell should be
# treated as compromised and rotated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    user_agent='Capstone_Project (by /u/kitconnelly)',
    username='kitconnelly'
)

In [27]:
# Select the subreddit and fetch its top posts.
# `subreddit.top()` returns a lazy listing that can only be iterated once, so
# it is materialized into a list here. Cells that loop over `posts` can then
# be re-run without silently iterating over an exhausted generator.
subreddit = reddit.subreddit("NashvilleSC")
posts = list(subreddit.top(limit=10))

In [28]:
# Define a function to capture a comment together with all of its nested replies
def capture_comments(comment, comments_list, post_id, parent_id=None):
    """Flatten ``comment`` and every nested reply into ``comments_list``.

    One dict is appended per comment, in depth-first pre-order (a comment,
    then its first reply and that reply's descendants, then its next reply,
    and so on). The tree is walked with an explicit stack rather than
    recursion, so very deep reply chains cannot hit Python's recursion limit.

    Args:
        comment: Root comment to start from. It and each of its replies must
            expose ``id``, ``body``, ``author`` (an object with ``name``, or a
            falsy value) and an iterable ``replies``.
        comments_list: List that is mutated in place; records are appended.
        post_id: ID of the post the comments belong to, stored on every record.
        parent_id: Parent comment ID recorded for the root ``comment``;
            ``None`` marks a top-level comment.

    Returns:
        None. Results are delivered through ``comments_list``.
    """
    # Each stack entry pairs a comment with the ID of its parent comment.
    pending = [(comment, parent_id)]
    while pending:
        node, node_parent_id = pending.pop()
        comments_list.append({
            'Comment ID': node.id,
            'Parent Comment ID': node_parent_id,  # None for top-level comments
            'Text': node.body,
            'Author': node.author.name if node.author else 'Unknown',
            'Post ID': post_id
        })
        # Push replies in reverse so they are popped (and recorded) in their
        # original order, matching a recursive pre-order traversal.
        pending.extend((reply, node.id) for reply in reversed(list(node.replies)))

# Flat list of comment records collected across all top posts.
# Re-initialized here so re-running this cell does not duplicate rows.
all_comments = []

# Parallel per-post columns used to assemble the posts DataFrame
ids = []
titles = []
contents = []
authors = []
post_dates = []

# Iterate through the top posts
for post in posts:
    # limit=None keeps expanding "more comments" placeholders until none are
    # left, so the traversal below only meets real comments.
    post.comments.replace_more(limit=None)

    post_id = post.id

    ids.append(post_id)
    titles.append(post.title)
    contents.append(post.selftext)
    # Fall back to 'Unknown' when the post has no author object
    authors.append(post.author.name if post.author else 'Unknown')
    # created_utc is converted with an explicit UTC timezone.
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; the
    # formatted string is the same.
    post_dates.append(
        datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
    )

    # Capture the full comment tree under each top-level comment
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# Create a Pandas DataFrame from the flattened comments data.
# Columns are listed explicitly so the frame keeps its schema even when no
# comments were collected.
comments_df = pd.DataFrame(
    all_comments,
    columns=['Comment ID', 'Parent Comment ID', 'Text', 'Author', 'Post ID'],
)
posts_dict = {'id': ids, 'Title': titles, 'Content': contents, 'Author': authors, 'Post Date': post_dates}
posts_df = pd.DataFrame(posts_dict)

In [31]:
# Preview the first 10 rows of the flattened comments DataFrame
# (one row per comment; 'Parent Comment ID' is empty for top-level comments)
comments_df.head(10)

Unnamed: 0,Comment ID,Parent Comment ID,Text,Author,Post ID
0,i97ewgh,,It was fantastic! You crushed it!,DirtyFlip,ut4efu
1,i97eye1,,Sounded great!,Cerebralflea,ut4efu
2,i97fvs9,,you did a great job!!,trillwilly69,ut4efu
3,i97llqq,,Amazing job last night. Hope you can come bac...,jasonlp03,ut4efu
4,i97o17n,i97llqq,Me too!,Grace-Music,ut4efu
5,i97gs0n,,Great job!!!,AggieinTN,ut4efu
6,i97gegb,,Badass,R-Smelly,ut4efu
7,i97hpil,,Sounded awesome!!,BigBlueNate33,ut4efu
8,i97rdeh,,Absolutely killed it also!\nWhere do you pull ...,Cam_man518,ut4efu
9,i97tznu,,Great job!,danrydel,ut4efu


In [32]:
# Preview the first 10 rows of the posts DataFrame (one row per scraped post)
posts_df.head(10)

Unnamed: 0,id,Title,Content,Author,Post Date
0,ut4efu,I played the guitar riff yesterday!,,Grace-Music,2022-05-19 14:16:19
1,n3x5zx,Took my daughter to her first MLS game yesterd...,,JiuManji,2021-05-03 14:15:03
2,jy4kdq,FIRST PLAYOFF WIN UPVOTE PARTY!!,LETS GO!! What a Game!!!,BigBlueNate33,2020-11-21 04:26:24
3,fbuyj5,Thought this guy deserved a shoutout,,fullthrottle13,2020-03-01 14:44:32
4,k0isnq,ANOTHER PLAYOFF WIN UPVOTE PARTY!!,LETS. FREAKING. GO!!!!!!! MASSIVE CLUB!!! Semi...,BigBlueNate33,2020-11-25 01:56:02
5,irq1so,NASHVILLE SC 4-2 WIN UPVOTE PARTY!!!,¡CHALUPAS PARA TODOS!,SteveHeaves,2020-09-13 02:29:59
6,yjf55t,[Nashville SC] Hany Mukhtar is the 2022 Landon...,,pasoud,2022-11-01 17:33:09
7,n7wozu,FIRST WIN OF THE SEASON UPVOTE PARTY!,First of hopefully many!!!,BigBlueNate33,2021-05-08 19:34:17
8,uns6do,Call the ambulance,,copjon,2022-05-12 03:57:40
9,jk1yux,Nashville SC officially going to the playoffs!,So proud of our boys in gold clinching a playo...,johnyates,2020-10-29 02:33:22
