# Reddit Data Pull
> This notebook pulls all of the Reddit data used in the project. It should only need to be run once, and the run will need to be split into batches so that it does not exceed the Reddit API query limits.

### Load libraries and set credentials

In [1]:
import os
from datetime import datetime, timezone

import pandas as pd
import praw

# Credentials are read from environment variables so that no secret is stored
# in the notebook. Set REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME
# and REDDIT_PASSWORD before starting the kernel; a missing variable raises
# KeyError here instead of failing later with an authentication error.
# NOTE(review): the values previously committed in this cell must be rotated.
reddit = praw.Reddit(
    client_id=os.environ['REDDIT_CLIENT_ID'],
    client_secret=os.environ['REDDIT_CLIENT_SECRET'],
    # The user agent is not a secret; it can be overridden but keeps its old default.
    user_agent=os.environ.get('REDDIT_USER_AGENT', 'Capstone_Project (by /u/kitconnelly)'),
    username=os.environ['REDDIT_USERNAME'],
    password=os.environ['REDDIT_PASSWORD'],
)

In [2]:
# Pull comments function
def capture_comments(comment, comments_list, post_id, parent_id=None):
    """Append ``comment`` and every nested reply to ``comments_list`` as flat records.

    Records are appended depth-first with each parent before its replies, and
    replies are visited in the order they appear in the parent's ``replies``.

    Parameters
    ----------
    comment
        Object exposing ``id``, ``body``, ``author``, ``created_utc`` and
        ``replies`` (a PRAW comment in this notebook).
    comments_list : list
        Mutated in place. One dict is appended per comment with the keys
        'Comment ID', 'Parent Comment ID', 'Text', 'Author', 'Date', 'Post ID'.
        'Date' is ``created_utc`` rendered as a 'YYYY-MM-DD HH:MM:SS' UTC string;
        'Author' is 'Unknown' when the comment has no author.
    post_id
        ID of the post the thread belongs to; copied onto every record.
    parent_id : optional
        ID of the comment that ``comment`` replies to. ``None`` (the default)
        marks a top-level comment.

    Returns
    -------
    None
    """
    # An explicit stack replaces recursion so that very deep reply chains
    # cannot exceed Python's recursion limit.
    pending = [(comment, parent_id)]

    while pending:
        current, current_parent = pending.pop()

        # Timezone-aware conversion; datetime.utcfromtimestamp is deprecated
        # since Python 3.12. The formatted string is identical.
        timestamp = datetime.fromtimestamp(current.created_utc, tz=timezone.utc)

        comments_list.append({
            'Comment ID': current.id,
            'Parent Comment ID': current_parent,  # Store the parent comment ID
            'Text': current.body,
            'Author': current.author.name if current.author else 'Unknown',
            'Date': timestamp.strftime('%Y-%m-%d %H:%M:%S'),
            'Post ID': post_id
        })

        # Push replies in reverse so they are popped (and recorded) in their
        # original order, immediately after their parent.
        for reply in reversed(list(current.replies)):
            pending.append((reply, current.id))

## Subreddit - NashvilleSC
- For the specified subreddit, up to 1000 posts (the API limit per listing) will be pulled for each available category; the limit starts lower and will eventually be scaled up to 1000.
- Each individual pull will be stored in its own dataframe in case an individual category (such as recent posts) needs to be researched further.
- At the end of the section, two large dataframes will be made for the subreddit (one for posts and one for comments).
- These dataframes can then be cleaned (removing '\n' and emojis) and have duplicate rows removed (based on post id for posts, and on the combination of comment ids for comments).

NOTE: Matchday threads by SportsThreadderBot will need to be filtered out

#### Top Posts

In [3]:
subreddit = reddit.subreddit("NashvilleSC")
posts = subreddit.top(limit=10)

# Flattened comment records for every post in the "top" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
nsc_top_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    nsc_top_posts_dict['id'].append(post_id)
    nsc_top_posts_dict['Title'].append(post.title)
    nsc_top_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    nsc_top_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    nsc_top_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
nsc_top_comments_df = pd.DataFrame(all_comments)
nsc_top_posts_df = pd.DataFrame(nsc_top_posts_dict)

#### New posts

In [4]:
posts = subreddit.new(limit=10)

# Flattened comment records for every post in the "new" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
nsc_new_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    nsc_new_posts_dict['id'].append(post_id)
    nsc_new_posts_dict['Title'].append(post.title)
    nsc_new_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    nsc_new_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    nsc_new_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
nsc_new_comments_df = pd.DataFrame(all_comments)
nsc_new_posts_df = pd.DataFrame(nsc_new_posts_dict)

#### Hot posts

In [5]:
posts = subreddit.hot(limit=10)

# Flattened comment records for every post in the "hot" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
nsc_hot_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    nsc_hot_posts_dict['id'].append(post_id)
    nsc_hot_posts_dict['Title'].append(post.title)
    nsc_hot_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    nsc_hot_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    nsc_hot_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
nsc_hot_comments_df = pd.DataFrame(all_comments)
nsc_hot_posts_df = pd.DataFrame(nsc_hot_posts_dict)

#### Rising posts

In [6]:
posts = subreddit.rising(limit=10)

# Flattened comment records for every post in the "rising" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
nsc_rising_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    nsc_rising_posts_dict['id'].append(post_id)
    nsc_rising_posts_dict['Title'].append(post.title)
    nsc_rising_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    nsc_rising_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    nsc_rising_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
nsc_rising_comments_df = pd.DataFrame(all_comments)
nsc_rising_posts_df = pd.DataFrame(nsc_rising_posts_dict)

#### Controversial Posts

In [7]:
posts = subreddit.controversial(limit=10)

# Flattened comment records for every post in the "controversial" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
nsc_controversial_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    nsc_controversial_posts_dict['id'].append(post_id)
    nsc_controversial_posts_dict['Title'].append(post.title)
    nsc_controversial_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    nsc_controversial_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    nsc_controversial_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
nsc_controversial_comments_df = pd.DataFrame(all_comments)
nsc_controversial_posts_df = pd.DataFrame(nsc_controversial_posts_dict)

#### Concatenating into 1 data frame

In [8]:
# Stack the five per-listing NashvilleSC frames (top, new, hot, rising,
# controversial) into one posts frame and one comments frame.
# Each frame keeps its own row labels, so index values repeat after stacking.
nsc_post_frames = [
    nsc_top_posts_df,
    nsc_new_posts_df,
    nsc_hot_posts_df,
    nsc_rising_posts_df,
    nsc_controversial_posts_df,
]
nsc_comment_frames = [
    nsc_top_comments_df,
    nsc_new_comments_df,
    nsc_hot_comments_df,
    nsc_rising_comments_df,
    nsc_controversial_comments_df,
]

nsc_posts = pd.concat(nsc_post_frames)
nsc_comments = pd.concat(nsc_comment_frames)

In [9]:
# Sanity check: (rows, columns) of the combined NashvilleSC posts and comments frames
nsc_posts.shape, nsc_comments.shape

((50, 5), (1190, 6))

## Subreddit - Tennesseetitans
- For the specified subreddit, up to 1000 posts (the API limit per listing) will be pulled for each available category; the limit starts lower and will eventually be scaled up to 1000.
- Each individual pull will be stored in its own dataframe in case an individual category (such as recent posts) needs to be researched further.
- At the end of the section, two large dataframes will be made for the subreddit (one for posts and one for comments).
- These dataframes can then be cleaned (removing '\n' and emojis) and have duplicate rows removed (based on post id for posts, and on the combination of comment ids for comments).

NOTE: Gameday threads posted by Titans_Mod will need to be filtered out before pulling

#### Top Posts

In [10]:
subreddit = reddit.subreddit("Tennesseetitans")
posts = subreddit.top(limit=10)

# Flattened comment records for every post in the "top" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
tt_top_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    tt_top_posts_dict['id'].append(post_id)
    tt_top_posts_dict['Title'].append(post.title)
    tt_top_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    tt_top_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    tt_top_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
tt_top_comments_df = pd.DataFrame(all_comments)
tt_top_posts_df = pd.DataFrame(tt_top_posts_dict)

#### New Posts

In [11]:
posts = subreddit.new(limit=10)

# Flattened comment records for every post in the "new" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
tt_new_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    tt_new_posts_dict['id'].append(post_id)
    tt_new_posts_dict['Title'].append(post.title)
    tt_new_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    tt_new_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    tt_new_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
tt_new_comments_df = pd.DataFrame(all_comments)
tt_new_posts_df = pd.DataFrame(tt_new_posts_dict)

#### Hot Posts

In [12]:
posts = subreddit.hot(limit=10)

# Flattened comment records for every post in the "hot" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
tt_hot_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    tt_hot_posts_dict['id'].append(post_id)
    tt_hot_posts_dict['Title'].append(post.title)
    tt_hot_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    tt_hot_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    tt_hot_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
tt_hot_comments_df = pd.DataFrame(all_comments)
tt_hot_posts_df = pd.DataFrame(tt_hot_posts_dict)

#### Rising Posts

In [13]:
posts = subreddit.rising(limit=10)

# Flattened comment records for every post in the "rising" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
tt_rising_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    tt_rising_posts_dict['id'].append(post_id)
    tt_rising_posts_dict['Title'].append(post.title)
    tt_rising_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    tt_rising_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    tt_rising_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
tt_rising_comments_df = pd.DataFrame(all_comments)
tt_rising_posts_df = pd.DataFrame(tt_rising_posts_dict)

#### Controversial Posts

In [14]:
posts = subreddit.controversial(limit=10)

# Flattened comment records for every post in the "controversial" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
tt_controversial_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    tt_controversial_posts_dict['id'].append(post_id)
    tt_controversial_posts_dict['Title'].append(post.title)
    tt_controversial_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    tt_controversial_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    tt_controversial_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
tt_controversial_comments_df = pd.DataFrame(all_comments)
tt_controversial_posts_df = pd.DataFrame(tt_controversial_posts_dict)

#### Concatenating into 1 data frame

In [15]:
# Stack the five per-listing Tennesseetitans frames (top, new, hot, rising,
# controversial) into one posts frame and one comments frame.
# Each frame keeps its own row labels, so index values repeat after stacking.
tt_post_frames = [
    tt_top_posts_df,
    tt_new_posts_df,
    tt_hot_posts_df,
    tt_rising_posts_df,
    tt_controversial_posts_df,
]
tt_comment_frames = [
    tt_top_comments_df,
    tt_new_comments_df,
    tt_hot_comments_df,
    tt_rising_comments_df,
    tt_controversial_comments_df,
]

tt_posts = pd.concat(tt_post_frames)
tt_comments = pd.concat(tt_comment_frames)

In [16]:
# Sanity check: (rows, columns) of the combined Tennesseetitans posts and comments frames
tt_posts.shape, tt_comments.shape

((50, 5), (5918, 6))

## Subreddit - Predators
- For the specified subreddit, up to 1000 posts (the API limit per listing) will be pulled for each available category; the limit starts lower and will eventually be scaled up to 1000.
- Each individual pull will be stored in its own dataframe in case an individual category (such as recent posts) needs to be researched further.
- At the end of the section, two large dataframes will be made for the subreddit (one for posts and one for comments).
- These dataframes can then be cleaned (removing '\n' and emojis) and have duplicate rows removed (based on post id for posts, and on the combination of comment ids for comments).

#### Top Posts

In [17]:
subreddit = reddit.subreddit("Predators")
posts = subreddit.top(limit=10)

# Flattened comment records for every post in the "top" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
preds_top_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    preds_top_posts_dict['id'].append(post_id)
    preds_top_posts_dict['Title'].append(post.title)
    preds_top_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    preds_top_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    preds_top_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
preds_top_comments_df = pd.DataFrame(all_comments)
preds_top_posts_df = pd.DataFrame(preds_top_posts_dict)

#### New Posts

In [18]:
posts = subreddit.new(limit=10)

# Flattened comment records for every post in the "new" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
preds_new_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    preds_new_posts_dict['id'].append(post_id)
    preds_new_posts_dict['Title'].append(post.title)
    preds_new_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    preds_new_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    preds_new_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
preds_new_comments_df = pd.DataFrame(all_comments)
preds_new_posts_df = pd.DataFrame(preds_new_posts_dict)

#### Hot Posts

In [19]:
posts = subreddit.hot(limit=10)

# Flattened comment records for every post in the "hot" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
preds_hot_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    preds_hot_posts_dict['id'].append(post_id)
    preds_hot_posts_dict['Title'].append(post.title)
    preds_hot_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    preds_hot_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    preds_hot_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
preds_hot_comments_df = pd.DataFrame(all_comments)
preds_hot_posts_df = pd.DataFrame(preds_hot_posts_dict)

#### Rising Posts

In [20]:
posts = subreddit.rising(limit=10)

# Flattened comment records for every post in the "rising" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
preds_rising_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    preds_rising_posts_dict['id'].append(post_id)
    preds_rising_posts_dict['Title'].append(post.title)
    preds_rising_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    preds_rising_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    preds_rising_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
preds_rising_comments_df = pd.DataFrame(all_comments)
preds_rising_posts_df = pd.DataFrame(preds_rising_posts_dict)

#### Controversial Posts

In [21]:
posts = subreddit.controversial(limit=10)

# Flattened comment records for every post in the "controversial" listing
all_comments = []

# Column-wise post data; each key becomes a column of the posts data frame
preds_controversial_posts_dict = {'id': [], 'Title': [], 'Content': [], 'Author': [], 'Post Date': []}

for post in posts:
    # limit=None resolves every "more comments" placeholder before the tree is walked
    post.comments.replace_more(limit=None)

    post_id = post.id
    # Timezone-aware conversion (datetime.utcfromtimestamp is deprecated since Python 3.12)
    created = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)

    preds_controversial_posts_dict['id'].append(post_id)
    preds_controversial_posts_dict['Title'].append(post.title)
    preds_controversial_posts_dict['Content'].append(post.selftext)
    # Posts without an author object are recorded as 'Unknown'
    preds_controversial_posts_dict['Author'].append(post.author.name if post.author else 'Unknown')
    preds_controversial_posts_dict['Post Date'].append(created.strftime('%Y-%m-%d %H:%M:%S'))

    # capture_comments appends each top-level comment and its nested replies
    for top_level_comment in post.comments:
        capture_comments(top_level_comment, all_comments, post_id)

# One data frame of comments and one of posts for this listing
preds_controversial_comments_df = pd.DataFrame(all_comments)
preds_controversial_posts_df = pd.DataFrame(preds_controversial_posts_dict)

#### Concatenating into 1 data frame

In [22]:
# Stack the five per-listing Predators frames (top, new, hot, rising,
# controversial) into one posts frame and one comments frame.
# Each frame keeps its own row labels, so index values repeat after stacking.
preds_post_frames = [
    preds_top_posts_df,
    preds_new_posts_df,
    preds_hot_posts_df,
    preds_rising_posts_df,
    preds_controversial_posts_df,
]
preds_comment_frames = [
    preds_top_comments_df,
    preds_new_comments_df,
    preds_hot_comments_df,
    preds_rising_comments_df,
    preds_controversial_comments_df,
]

preds_posts = pd.concat(preds_post_frames)
preds_comments = pd.concat(preds_comment_frames)

In [23]:
# Sanity check: (rows, columns) of the combined Predators posts and comments frames
preds_posts.shape, preds_comments.shape

((50, 5), (1876, 6))

In [24]:
# Preview the first rows of the combined NashvilleSC posts frame
nsc_posts.head()

Unnamed: 0,id,Title,Content,Author,Post Date
0,ut4efu,I played the guitar riff yesterday!,,Grace-Music,2022-05-19 14:16:19
1,n3x5zx,Took my daughter to her first MLS game yesterd...,,JiuManji,2021-05-03 14:15:03
2,jy4kdq,FIRST PLAYOFF WIN UPVOTE PARTY!!,LETS GO!! What a Game!!!,BigBlueNate33,2020-11-21 04:26:24
3,fbuyj5,Thought this guy deserved a shoutout,,fullthrottle13,2020-03-01 14:44:32
4,k0isnq,ANOTHER PLAYOFF WIN UPVOTE PARTY!!,LETS. FREAKING. GO!!!!!!! MASSIVE CLUB!!! Semi...,BigBlueNate33,2020-11-25 01:56:02


In [25]:
# Preview the first rows of the combined NashvilleSC comments frame
nsc_comments.head()

Unnamed: 0,Comment ID,Parent Comment ID,Text,Author,Date,Post ID
0,i97ewgh,,It was fantastic! You crushed it!,DirtyFlip,2022-05-19 14:26:04,ut4efu
1,i97eye1,,Sounded great!,Cerebralflea,2022-05-19 14:26:27,ut4efu
2,i97fvs9,,you did a great job!!,trillwilly69,2022-05-19 14:33:10,ut4efu
3,i97llqq,,Amazing job last night. Hope you can come bac...,jasonlp03,2022-05-19 15:13:11,ut4efu
4,i97o17n,i97llqq,Me too!,Grace-Music,2022-05-19 15:29:57,ut4efu


### Writing to CSV File

In [27]:
# Each combined frame is written to its own CSV; the mapping pairs the output
# path with the frame stored there. Writes happen in the order listed.
csv_targets = {
    '../Reddit Data/nsc_posts.csv': nsc_posts,
    '../Reddit Data/nsc_comments.csv': nsc_comments,
    '../Reddit Data/tt_posts.csv': tt_posts,
    '../Reddit Data/tt_comments.csv': tt_comments,
    '../Reddit Data/preds_posts.csv': preds_posts,
    '../Reddit Data/preds_comments.csv': preds_comments,
}

for csv_path, frame in csv_targets.items():
    frame.to_csv(path_or_buf=csv_path)