In [2]:
import asyncio
import asyncpraw
import nest_asyncio
import time
import pandas as pd
from datetime import datetime
import re  
import aiohttp

# Allows running async inside Jupyter Notebook or scripts with existing event loops
nest_asyncio.apply()

# Reddit API setup
async def get_reddit_instance():
    return asyncpraw.Reddit(
        client_id="j9JdlLnSugDJxrhTFoXExQ",
        client_secret="QJbefKRgxmm7fY1ZYp6oQL0iv2wgUg",
        user_agent="Hi",
        username="FRD-23",
        password="@_Zurite1721212232005",
        requestor_kwargs={"session": aiohttp.ClientSession()}
    )

# Function to fetch EPL match threads from matchday 20 onward
async def get_matchday_threads(year, start_matchday=20):
    reddit = await get_reddit_instance()
    redditor = await reddit.redditor("MatchThreadder")  
    match_threads = []  
    seen_matches = set()  # Track unique matches to avoid duplicates

    print(f"Searching for EPL match threads from matchday {start_matchday} in {year}...")

    async for submission in redditor.submissions.new(limit=1000):
        try:
            post_time = datetime.utcfromtimestamp(submission.created_utc)
            post_year = post_time.year  

            if post_year != year:
                continue  

            title_lower = submission.title.lower()
            matchday_search = re.search(r"matchday (\d+)", title_lower)
            if matchday_search:
                matchday_number = int(matchday_search.group(1))
                if matchday_number < start_matchday:
                    continue  
            
            if "match thread" in title_lower and "premier league" in title_lower:
                match_title = submission.title  # Use full title for uniqueness check
                if match_title in seen_matches:
                    continue  
                seen_matches.add(match_title)
                
                match_score = extract_match_score(submission.title)
                print(f"Found EPL Thread: {submission.title} {post_time} | {submission.url}")
                match_threads.append({
                    "post_id": submission.id,
                    "post_title": submission.title,
                    "URL": submission.url,
                    "Date": post_time.strftime('%Y-%m-%d'),
                    "Score": match_score
                })
        except Exception as e:
            print(f"Error processing post: {e}")

    return match_threads  

# Function to extract match points (final score) from the title
def extract_match_score(title):
    match = re.search(r"(\d+)\s*-\s*(\d+)", title)
    if match:
        return f"{match.group(1)}-{match.group(2)}"  
    return "Unknown"

# Function to scrape comments and replies from a match thread
async def scrape_comments(match_threads):
    reddit = await get_reddit_instance()
    all_comments = []

    for match in match_threads:
        post_url = match["URL"]
        print(f"\nScraping comments for: {match['post_title']}")

        try:
            submission = await reddit.submission(url=post_url)
            await submission.comments.replace_more(limit=3)  

            for comment in submission.comments.list():  
                try:
                    await asyncio.sleep(1.5)  
                    all_comments.append({
                        "post_id": match["post_id"],
                        "post_title": match["post_title"],
                        "comment_id": comment.id,
                        "comment_text": comment.body,
                        "comment_author": comment.author.name if comment.author else "[deleted]",
                        "comment_score": comment.score,
                        "comment_time": datetime.utcfromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                        "parent_id": comment.parent_id if comment.parent_id else "None"
                    })

                    # Scrape replies to comments
                    for reply in comment.replies:
                        all_comments.append({
                            "post_id": match["post_id"],
                            "post_title": match["post_title"],
                            "comment_id": reply.id,
                            "comment_text": reply.body,
                            "comment_author": reply.author.name if reply.author else "[deleted]",
                            "comment_score": reply.score,
                            "comment_time": datetime.utcfromtimestamp(reply.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
                            "parent_id": reply.parent_id if reply.parent_id else "None"
                        })
                except Exception as e:
                    print(f"Error processing comment: {e}")
        except Exception as e:
            print(f"Error fetching post: {e}")

    return all_comments

# Input: Year to scrape
year = 2025  
matchday_start = 20  

# Run async tasks
async def main():
    match_threads = await get_matchday_threads(year, matchday_start)

    if match_threads:
        comments_data = await scrape_comments(match_threads)
        df_matches = pd.DataFrame(match_threads)
        csv_filename_matches = f"epl_match_threads_{year}_matchday{matchday_start}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
        df_matches.to_csv(csv_filename_matches, index=False)
        print(f"\nExported {len(match_threads)} EPL match threads from matchday {matchday_start} onward to {csv_filename_matches}")

        if comments_data:
            df_comments = pd.DataFrame(comments_data)
            csv_filename_comments = f"epl_match_comments_{year}_matchday{matchday_start}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
            df_comments.to_csv(csv_filename_comments, index=False)
            print(f"\nExported {len(comments_data)} comments and replies to {csv_filename_comments}")
        else:
            print("\nNo comments extracted.")
    else:
        print(f"\nNo EPL match threads found from matchday {matchday_start} onward in {year}.")

# Run the main async function
asyncio.run(main())


Searching for EPL match threads from matchday 20 in 2025...


  post_time = datetime.utcfromtimestamp(submission.created_utc)


Found EPL Thread: Match Thread: West Ham United vs Leicester City | English Premier League 2025-02-27 19:52:06 | https://www.reddit.com/r/soccer/comments/1izoqum/match_thread_west_ham_united_vs_leicester_city/
Found EPL Thread: Match Thread: Liverpool vs Newcastle United | English Premier League 2025-02-26 20:05:58 | https://www.reddit.com/r/soccer/comments/1iywufn/match_thread_liverpool_vs_newcastle_united/
Found EPL Thread: Match Thread: Manchester United vs Ipswich Town | English Premier League 2025-02-26 19:42:17 | https://www.reddit.com/r/soccer/comments/1iywa07/match_thread_manchester_united_vs_ipswich_town/
Found EPL Thread: Match Thread: Brentford vs Everton | English Premier League 2025-02-26 19:28:26 | https://www.reddit.com/r/soccer/comments/1iyvxzb/match_thread_brentford_vs_everton_english_premier/
Found EPL Thread: Match Thread: Nottingham Forest vs Arsenal | English Premier League 2025-02-26 19:28:16 | https://www.reddit.com/r/soccer/comments/1iyvxty/match_thread_nottingh

  "comment_time": datetime.utcfromtimestamp(comment.created_utc).strftime('%Y-%m-%d %H:%M:%S'),
  "comment_time": datetime.utcfromtimestamp(reply.created_utc).strftime('%Y-%m-%d %H:%M:%S'),



Scraping comments for: Match Thread: Liverpool vs Newcastle United | English Premier League

Scraping comments for: Match Thread: Manchester United vs Ipswich Town | English Premier League

Scraping comments for: Match Thread: Brentford vs Everton | English Premier League

Scraping comments for: Match Thread: Nottingham Forest vs Arsenal | English Premier League

Scraping comments for: Match Thread: Tottenham Hotspur vs Manchester City | English Premier League

Scraping comments for: Match Thread: Chelsea vs Southampton | English Premier League

Scraping comments for: Match Thread: Wolverhampton Wanderers vs Fulham | English Premier League

Scraping comments for: Match Thread: Crystal Palace vs Aston Villa | English Premier League

Scraping comments for: Match Thread: Manchester City vs Liverpool | English Premier League

Scraping comments for: Match Thread: Newcastle United vs Nottingham Forest | English Premier League

Scraping comments for: Match Thread: Aston Villa vs Chelsea | En

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021E2DE12570>
Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x0000021E2CB4AC60>



Scraping comments for: Match Thread: Southampton vs Brighton & Hove Albion | English Premier League

Scraping comments for: Match Thread: Fulham vs Crystal Palace | English Premier League

Scraping comments for: Match Thread: Ipswich Town vs Tottenham Hotspur | English Premier League

Scraping comments for: Match Thread: AFC Bournemouth vs Wolverhampton Wanderers | English Premier League

Scraping comments for: Match Thread: Everton vs Manchester United | English Premier League

Scraping comments for: Match Thread: Leicester City vs Brentford | English Premier League

Scraping comments for: Match Thread: Aston Villa vs Liverpool | English Premier League

Scraping comments for: Match Thread: Tottenham Hotspur vs Manchester United | English Premier League

Scraping comments for: Match Thread: Liverpool vs Wolverhampton Wanderers | English Premier League

Scraping comments for: Match Thread: Fulham vs Nottingham Forest | English Premier League

Scraping comments for: Match Thread: West H