Importing the needed libraries and setting up the Reddit scraper.

In [None]:
import os
from datetime import datetime, timezone

import pandas as pd
import praw
from dotenv import load_dotenv

In [27]:
# loading the praw api keys into the process environment via dotenv
load_dotenv()

# creating a read only reddit scraper from the credentials found in the environment
agent = praw.Reddit(
    client_id = os.getenv("praw_client_ID"),
    client_secret = os.getenv("praw_secret"),
    user_agent = os.getenv("praw_user_agent"),
)

Getting all of the serviceable NFL team subreddits to get post-game threads.

In [None]:
# defining the search criteria
# each pair is (subreddit name, team name); keeping them together guarantees the two
# lists below stay index-aligned
# Teams not included because their subreddits do not have an explicit post game thread: Vikings, Bills, Titans, Buccaneers
# Teams excluded because < 17 post game threads: Lions, Dolphins, Jaguars
_subreddit_team_pairs = [
    ('CHIBears', 'Bears'),
    ('cowboys', 'Cowboys'),
    ('eagles', 'Eagles'),
    ('GreenBayPackers', 'Packers'),
    ('Seahawks', 'Seahawks'),
    ('49ers', '49ers'),
    ('LosAngelesRams', 'Rams'),
    ('steelers', 'Steelers'),
    ('Browns', 'Browns'),
    ('DenverBroncos', 'Broncos'),
    ('NYGiants', 'Giants'),
    ('KansasCityChiefs', 'Chiefs'),
    ('falcons', 'Falcons'),
    ('Saints', 'Saints'),
    ('bengals', 'Bengals'),
    ('panthers', 'Panthers'),
    ('Texans', 'Texans'),
    ('ravens', 'Ravens'),
    ('Commanders', 'Commanders'),
    ('raiders', 'Raiders'),
    ('nyjets', 'Jets'),
    ('Colts', 'Colts'),
    ('AZCardinals', 'Cardinals'),
    ('Chargers', 'Chargers'),
]
subreddits = [subreddit_name for subreddit_name, _ in _subreddit_team_pairs]
teams = [team_name for _, team_name in _subreddit_team_pairs]

# default query; individual subreddits may need a different one
search_query = 'post-gamethread'
sort_order = "new"

# regular season window, with a buffer so threads posted a few days before or after games still count
season_start = datetime(2024, 9, 3)
season_end = datetime(2025, 1, 7)

# number of regular season games each team plays; thread keys run from 1 to this value
GAMES_PER_SEASON = 17

# search query that surfaces the post-game threads of each subreddit; any subreddit not
# listed here uses the default `search_query`. Resolving the query per subreddit (instead of
# overwriting one shared variable) keeps a team from silently inheriting the previous team's query.
game_thread_subs = ['cowboys', 'eagles', 'Seahawks', '49ers', 'LosAngelesRams', 'steelers', 'Browns',
                    'DenverBroncos', 'NYGiants', 'KansasCityChiefs', 'falcons', 'Saints', 'Texans',
                    'Commanders', 'Colts']
post_game_thread_subs = ['GreenBayPackers', 'panthers', 'ravens', 'raiders', 'nyjets', 'AZCardinals', 'Chargers']
query_overrides = {name: 'Game Thread' for name in game_thread_subs}
query_overrides.update({name: 'Post-Game Thread' for name in post_game_thread_subs})
query_overrides['bengals'] = 'post game thread'

# phrases (lower case) that mark a post as commentary about a post game thread rather than the thread itself
commentary_phrases = {
    'KansasCityChiefs': ('photo', 'had'),
    'falcons': ('can we',),
    'panthers': ('link',),
    'Texans': ('the jags',),
}

# defining storage dictionary for team threads: team name -> {game number: submission id}
team_thread_dict = {}
for subreddit_name, team_name in zip(subreddits, teams):
    subreddit = agent.subreddit(subreddit_name)
    team_query = query_overrides.get(subreddit_name, search_query)
    excluded_phrases = commentary_phrases.get(subreddit_name, ())

    # defining dict to store thread ids; results arrive newest first, so count down from the last game
    thread_dict = {}
    current_game = GAMES_PER_SEASON
    for submission in subreddit.search(query = team_query, sort = sort_order, limit = None):
        # created_utc is a UTC epoch; convert it as UTC (then drop the tzinfo so it compares with the
        # naive season bounds) so the filter does not depend on the timezone of the machine
        posted = datetime.fromtimestamp(submission.created_utc, tz = timezone.utc).replace(tzinfo = None)
        # making sure the post was in the 2024-2025 regular season
        if not (season_start <= posted <= season_end):
            continue

        # ensuring the proper title elements
        lower_title = submission.title.lower()
        proper_title = all(word in lower_title for word in ('post', 'game', 'thread'))

        # deals with commentary on post game threads
        if any(phrase in lower_title for phrase in excluded_phrases):
            proper_title = False
        # deals with duplicate thread
        if subreddit_name == 'cowboys' and submission.ups < 10:
            proper_title = False
        # dupe thread
        if subreddit_name == 'Saints' and ':' not in lower_title:
            proper_title = False

        if proper_title:
            # making the key the game number of the season
            thread_dict[current_game] = submission.id
            current_game -= 1

    # later cells index games 1..17 directly, so flag any team that did not yield exactly 17 threads
    if len(thread_dict) != GAMES_PER_SEASON:
        print(f"Warning: r/{subreddit_name} matched {len(thread_dict)} post game threads, expected {GAMES_PER_SEASON}")
    team_thread_dict[team_name] = thread_dict

Getting the win/loss schedule for each team.

In [25]:
# Manually entered using the Wikipedia page for each team's 2024-2025 NFL season.
# Maps a team name (spelled as in the `teams` list) to its 17 game results in schedule order,
# so index 0 is game 1. The values feed the Won_This_Week / Won_Next_Week columns, so
# presumably 1 = win and 0 = loss -- TODO confirm how a tie would be encoded.
win_loss_dict = {'Bears': [1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1], 'Cowboys': [1,0,0,1,1,0,0,0,0,0,1,1,0,1,1,0,0], 
                 'Eagles': [1,0,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1], 'Packers': [0,1,1,0,1,1,1,1,0,1,1,1,0,1,1,0,0], 
                 'Seahawks': [1,1,1,0,0,0,1,0,0,1,1,1,1,0,0,1,1], '49ers': [1,0,0,1,0,1,0,1,1,0,0,0,1,0,0,0,0], 
                 'Rams': [0,0,1,0,0,1,1,1,0,1,0,1,1,1,1,1,0], 'Steelers': [1,1,1,0,0,1,1,1,1,1,0,1,1,0,0,0,0], 
                 'Browns': [0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0], 'Broncos': [0,0,1,1,1,0,1,1,0,0,1,1,1,1,0,0,1], 
                 'Giants': [0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0], 'Chiefs': [1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0], 
                 'Falcons': [0,1,0,1,1,1,0,1,1,0,0,0,0,1,1,0,0], 'Saints': [1,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0], 
                 'Bengals': [0,0,0,1,0,1,1,0,1,0,0,0,1,1,1,1,1], 'Panthers': [0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,1], 
                 'Texans': [1,1,0,1,1,1,0,1,0,0,1,0,1,1,0,0,1], 'Ravens': [0,0,1,1,1,1,1,0,1,1,0,1,0,1,1,1,1],
                 'Commanders': [0,1,1,1,1,0,1,1,1,0,0,0,1,1,1,1,1], 'Raiders': [0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0], 
                 'Jets': [0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1], 'Colts': [0,0,1,1,0,1,1,0,0,0,1,0,1,0,1,0,1], 
                 'Cardinals': [0,1,0,0,1,0,1,1,1,1,0,0,0,1,0,0,1], 'Chargers': [1,1,0,0,1,0,1,1,1,1,0,1,0,0,1,1,1]}

Going through each thread to extract "messages" to be semantically processed.

In [None]:
def comment_depth_traversal(comment_list, results, path = None):
    """Collect every root-to-leaf reply chain of a comment tree as a single block of text.

    The idea is that a comment replying to another comment belongs to the same
    mini-conversation, so each chain of replies is joined into one string.

    Parameters
    ----------
    comment_list : iterable
        Comments at the current level of the tree. Items that are not
        ``praw.models.Comment`` instances are ignored.
    results : list
        Output list, mutated in place: one ``". "``-joined string is appended per chain.
    path : list of str, optional
        Cleaned texts of the ancestors of ``comment_list``; leave as ``None`` for top-level comments.

    Returns
    -------
    None
        The chains are delivered through ``results``.
    """
    # initializing a list to store the mini-conversation
    if path is None:
        path = []

    for comment in comment_list:
        if not isinstance(comment, praw.models.Comment):
            continue

        # flatten the comment onto one line
        comment_text = comment.body.replace('\n', ' ').strip()
        # a blank comment adds nothing to a conversation, so it (and its replies) is skipped
        if not comment_text:
            continue

        new_path = path + [comment_text]

        recorded_before = len(results)
        if comment.replies:
            comment_depth_traversal(comment.replies, results, path = new_path)
        # the comment ends its chain when it has no replies, or when none of its replies
        # produced a chain (all blank or not real comments); without this fallback the
        # comment and its ancestors would be dropped from the results entirely
        if len(results) == recorded_before:
            results.append(". ".join(new_path))


In [55]:
# parallel lists (one entry per team/game pair) that later become dataframe columns
team_list, game_number, win_this_week = [], [], []
game_comment_tree_list = []

# every team in `teams` is expected to have a thread for each of its 17 games
for team in teams:
    for game in range(1, 18):
        # row metadata: the team, the game number, and if they won that week
        team_list.append(team)
        game_number.append(game)
        win_this_week.append(win_loss_dict[team][game-1])

        # fetching the post game thread through its stored submission id
        submission = agent.submission(id = team_thread_dict[team][game])
        # only a few "more comments" stubs are expanded: saves time and keeps the text within
        # the vector limit of the sentiment analysis
        submission.comments.replace_more(limit = 5)

        # flattening the comment forest into one string per reply chain
        comment_threads = []
        comment_depth_traversal(submission.comments, comment_threads)
        game_comment_tree_list.append(comment_threads)

Turning the Reddit data into a CSV file.

In [None]:
# result of the following game for every team/game pair: the win/loss list shifted down
# by one, with None after game 17 because there is no next regular season game
won_next_week = [
    win_loss_dict[team][game] if game < 17 else None
    for team in teams
    for game in range(1, 18)
]

In [57]:
# assembling one row per team/game pair from the parallel lists built above
reddit_dict = {'Team': team_list, 'Comments': game_comment_tree_list, 
               'Game_Number': game_number, 'Won_This_Week': win_this_week,
               'Won_Next_Week': won_next_week}
reddit_df = pd.DataFrame(reddit_dict)

# to_csv does not create missing folders, so make sure the output folder exists first
os.makedirs('data', exist_ok = True)
# NOTE: each 'Comments' cell is a Python list, which the csv stores as its string form;
# it has to be parsed back into a list after reading the file
reddit_df.to_csv('data/reddit_comment_data.csv', index = False)