In [1]:
import pandas as pd
import numpy as np
import requests as req
from bs4 import BeautifulSoup
import math
from tqdm import tqdm


In [2]:
# Display names of the hackathons to scrape. Each name becomes a key of
# `all_project_links` and is later used as the CSV file stem.
names = [
    "HackHarvard 2024",
    "HackDearborn 2024",
    "HackKU25",
    "VTHacks 12",
    "RevolutionUC 2025",
    "MIT Reality Hack 2024",
    "TreeHacks 2025",
    "IDEA Hacks 2024",
    "HackNC 2024",
    "Auburn Hacks 2024",
    "Victor Hacks",
    # NOTE(review): looks like a typo for "BoilerMake"; kept unchanged because
    # the string is used verbatim as the dictionary key and CSV file name.
    "BiolerMake_X_2023",
    "IrvineHacks_2024"
]

# Number of submissions per hackathon, in the same order as `names`.
submissions = [141, 68, 96, 178, 60, 98, 257, 48, 94, 31, 18, 80, 88]

# Project-gallery URL prefix per hackathon, in the same order as `names`.
# Each ends with "?" so that "page=N" can be appended directly.
base_urls = [
    "https://hackharvard-2024.devpost.com/project-gallery?",
    "https://hackdearborn3.devpost.com/project-gallery?",
    "https://hackku-2025.devpost.com/project-gallery?",
    "https://vthacks-12.devpost.com/project-gallery?",
    "https://revolutionuc-2025.devpost.com/project-gallery?",
    "https://mit-reality-hack-2024.devpost.com/project-gallery?",
    "https://treehacks-2025.devpost.com/project-gallery?",
    "https://idea-hacks-2024.devpost.com/project-gallery?",
    "https://hacknc-2024.devpost.com/project-gallery?",
    "https://auburnhacks-2024.devpost.com/project-gallery?",
    "https://victor-hacks-nku.devpost.com/project-gallery?",
    "https://boilermake-x.devpost.com/project-gallery?",
    "https://irvinehacks-2024.devpost.com/project-gallery?"
]

# The page-count calculation assumes 24 projects per gallery page.
PROJECTS_PER_PAGE = 24

# The three lists are parallel; zip() would silently drop trailing entries if
# one of them were shorter, so fail loudly instead.
assert len(names) == len(submissions) == len(base_urls), (
    "names, submissions and base_urls must have the same length"
)

# Map each hackathon name to the list of its gallery page URLs (pages are 1-based).
all_project_links = {
    name: [
        f"{base_url}page={page}"
        # round up so a partially filled last page is still fetched
        for page in range(1, math.ceil(sub_count / PROJECTS_PER_PAGE) + 1)
    ]
    for name, base_url, sub_count in zip(names, base_urls, submissions)
}

# Display the full name -> page-URL mapping
all_project_links

{'HackHarvard 2024': ['https://hackharvard-2024.devpost.com/project-gallery?page=1',
  'https://hackharvard-2024.devpost.com/project-gallery?page=2',
  'https://hackharvard-2024.devpost.com/project-gallery?page=3',
  'https://hackharvard-2024.devpost.com/project-gallery?page=4',
  'https://hackharvard-2024.devpost.com/project-gallery?page=5',
  'https://hackharvard-2024.devpost.com/project-gallery?page=6'],
 'HackDearborn 2024': ['https://hackdearborn3.devpost.com/project-gallery?page=1',
  'https://hackdearborn3.devpost.com/project-gallery?page=2',
  'https://hackdearborn3.devpost.com/project-gallery?page=3'],
 'HackKU25': ['https://hackku-2025.devpost.com/project-gallery?page=1',
  'https://hackku-2025.devpost.com/project-gallery?page=2',
  'https://hackku-2025.devpost.com/project-gallery?page=3',
  'https://hackku-2025.devpost.com/project-gallery?page=4'],
 'VTHacks 12': ['https://vthacks-12.devpost.com/project-gallery?page=1',
  'https://vthacks-12.devpost.com/project-gallery?page=

In [3]:
# Sanity check: expect 13 entries, one per hackathon listed in `names`.
# (Original note: "13 = 10 + 3" — presumably 10 initial hackathons plus 3 added later; TODO confirm.)
len(all_project_links)

13

In [4]:
def link_to_csv(links, hackathon_name, timeout=30):
    """Scrape project listings from gallery pages and save them to a CSV file.

    Every URL in ``links`` is fetched and parsed; each
    ``<a class="block-wrapper-link fade link-to-software">`` element found
    becomes one row. The rows are written to ``{hackathon_name}.csv`` in the
    current working directory and a summary line is printed.

    Parameters
    ----------
    links : iterable of str
        Gallery page URLs to fetch.
    hackathon_name : str
        Value stored in the ``hackathon_name`` column; also used as the stem
        of the output file name.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook. Defaults to 30.

    Raises
    ------
    requests.HTTPError
        If a page answers with an HTTP error status (instead of silently
        parsing the error page and recording zero projects).
    KeyError
        If a matching anchor has no ``href`` attribute.
    """
    # Fixed column order; passing it explicitly guarantees that an empty
    # scrape still produces a CSV with a header row that pd.read_csv can parse.
    columns = ["hackathon_name", "project_name", "project_link", "winner"]
    project_data = []

    for url in links:
        response = req.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        links_soup = soup.find_all("a", class_="block-wrapper-link fade link-to-software")

        for a in links_soup:
            # The project title is taken from the <h5> inside the anchor, if any.
            title_tag = a.find("h5")
            # An <img class="winner"> inside the anchor marks a winning project.
            winner_tag = a.find("img", class_="winner")

            project_data.append({
                "hackathon_name": hackathon_name,
                "project_name": title_tag.get_text(strip=True) if title_tag else "",
                "project_link": a["href"],
                "winner": winner_tag is not None
            })

    df = pd.DataFrame(project_data, columns=columns)
    df.to_csv(f"{hackathon_name}.csv", index=False)
    print(f"✅ Saved {len(project_data)} projects from {hackathon_name} to {hackathon_name}.csv")

In [5]:
# Loop through all hackathons with a progress bar
# Scrape every hackathon in turn (one CSV per hackathon), showing overall progress.
hackathon_progress = tqdm(all_project_links.items(), desc="Processing Hackathons")
for event_name, gallery_pages in hackathon_progress:
    link_to_csv(gallery_pages, event_name)

Processing Hackathons:   8%|████▌                                                       | 1/13 [00:05<01:07,  5.65s/it]

✅ Saved 141 projects from HackHarvard 2024 to HackHarvard 2024.csv


Processing Hackathons:  15%|█████████▏                                                  | 2/13 [00:08<00:41,  3.75s/it]

✅ Saved 68 projects from HackDearborn 2024 to HackDearborn 2024.csv


Processing Hackathons:  23%|█████████████▊                                              | 3/13 [00:11<00:34,  3.44s/it]

✅ Saved 96 projects from HackKU25 to HackKU25.csv


Processing Hackathons:  31%|██████████████████▍                                         | 4/13 [00:17<00:41,  4.65s/it]

✅ Saved 178 projects from VTHacks 12 to VTHacks 12.csv


Processing Hackathons:  38%|███████████████████████                                     | 5/13 [00:19<00:30,  3.82s/it]

✅ Saved 60 projects from RevolutionUC 2025 to RevolutionUC 2025.csv


Processing Hackathons:  46%|███████████████████████████▋                                | 6/13 [00:23<00:26,  3.81s/it]

✅ Saved 98 projects from MIT Reality Hack 2024 to MIT Reality Hack 2024.csv


Processing Hackathons:  54%|████████████████████████████████▎                           | 7/13 [00:32<00:33,  5.55s/it]

✅ Saved 257 projects from TreeHacks 2025 to TreeHacks 2025.csv


Processing Hackathons:  62%|████████████████████████████████████▉                       | 8/13 [00:34<00:21,  4.33s/it]

✅ Saved 48 projects from IDEA Hacks 2024 to IDEA Hacks 2024.csv


Processing Hackathons:  69%|█████████████████████████████████████████▌                  | 9/13 [00:37<00:15,  3.98s/it]

✅ Saved 94 projects from HackNC 2024 to HackNC 2024.csv


Processing Hackathons:  77%|█████████████████████████████████████████████▍             | 10/13 [00:39<00:09,  3.22s/it]

✅ Saved 31 projects from Auburn Hacks 2024 to Auburn Hacks 2024.csv


Processing Hackathons:  85%|█████████████████████████████████████████████████▉         | 11/13 [00:40<00:04,  2.45s/it]

✅ Saved 18 projects from Victor Hacks to Victor Hacks.csv


Processing Hackathons:  92%|██████████████████████████████████████████████████████▍    | 12/13 [00:42<00:02,  2.58s/it]

✅ Saved 80 projects from BiolerMake_X_2023 to BiolerMake_X_2023.csv


Processing Hackathons: 100%|███████████████████████████████████████████████████████████| 13/13 [00:45<00:00,  3.53s/it]

✅ Saved 88 projects from IrvineHacks_2024 to IrvineHacks_2024.csv





In [6]:
# Old code below (kept for reference only; not executed):

In [7]:
#project_link = [f"https://irvinehacks-2024.devpost.com/project-gallery?page={i}" for i in range(1, 5)]

In [8]:
# Proof of concept: an earlier version that scraped one hackathon at a time (kept as an inert string).
'''
import requests as req
from bs4 import BeautifulSoup
import pandas as pd

def link_to_csv(links, hackathon_name):
    project_data = []  # will hold dicts with hackathon, project_name, project_link, winner

    for url in links:   # loop directly over URLs
        response = req.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # find all project <a> tags
        links_soup = soup.find_all("a", class_="block-wrapper-link fade link-to-software")
        
        # extract href, name, and winner info
        for a in links_soup:
            href = a["href"]
            title_tag = a.find("h5")
            title = title_tag.get_text(strip=True) if title_tag else ""
            
            # check if there is a winner badge
            winner_tag = a.find("img", class_="winner")
            is_winner = True if winner_tag else False
            
            project_data.append({
                "hackathon_name": hackathon_name,
                "project_name": title,
                "project_link": href,
                "winner": is_winner
            })

    # save to CSV
    df = pd.DataFrame(project_data)
    df.to_csv(f"{hackathon_name}.csv", index=False)
    print(f"✅ Saved {len(project_data)} projects from {hackathon_name} to {hackathon_name}.csv")

'''


'\nimport requests as req\nfrom bs4 import BeautifulSoup\nimport pandas as pd\n\ndef link_to_csv(links, hackathon_name):\n    project_data = []  # will hold dicts with hackathon, project_name, project_link, winner\n\n    for url in links:   # loop directly over URLs\n        response = req.get(url)\n        soup = BeautifulSoup(response.text, "html.parser")\n\n        # find all project <a> tags\n        links_soup = soup.find_all("a", class_="block-wrapper-link fade link-to-software")\n        \n        # extract href, name, and winner info\n        for a in links_soup:\n            href = a["href"]\n            title_tag = a.find("h5")\n            title = title_tag.get_text(strip=True) if title_tag else ""\n            \n            # check if there is a winner badge\n            winner_tag = a.find("img", class_="winner")\n            is_winner = True if winner_tag else False\n            \n            project_data.append({\n                "hackathon_name": hackathon_name,\n      

In [11]:

import glob
import os

import pandas as pd

# Name of the combined output file. It also matches "*.csv", so it has to be
# excluded from the inputs; otherwise re-running this cell would merge the
# previous result into itself and duplicate every row.
MERGED_CSV = "all_hackathons_merged.csv"


def list_hackathon_csvs(pattern="*.csv", exclude=MERGED_CSV):
    """Return the CSV paths matching ``pattern``, sorted, without ``exclude``.

    Parameters
    ----------
    pattern : str
        Glob pattern used to find the input files.
    exclude : str
        Path of a file to leave out of the result (by default the merged
        output file).

    Returns
    -------
    list of str
        Matching paths in sorted order, so the merge order is deterministic.
    """
    excluded = os.path.abspath(exclude)
    return sorted(
        path for path in glob.glob(pattern)
        if os.path.abspath(path) != excluded
    )


# Collect the input CSV files from the current directory
csv_files = list_hackathon_csvs()

# Read each CSV into its own dataframe
dfs = [pd.read_csv(file) for file in csv_files]

if dfs:
    # Concatenate all dataframes into one and save the result
    merged_df = pd.concat(dfs, ignore_index=True)
    merged_df.to_csv(MERGED_CSV, index=False)
    print(f"✅ Merged {len(csv_files)} CSV files into {MERGED_CSV} with {len(merged_df)} total projects.")
else:
    # pd.concat raises on an empty list; report the situation instead
    merged_df = pd.DataFrame()
    print("⚠️ No input CSV files found; nothing was merged.")


✅ Merged 10 CSV files into all_hackathons_merged.csv with 1071 total projects.
