In [1]:
import pandas as pd
import numpy as np
import requests as req
from bs4 import BeautifulSoup
import math
from tqdm import tqdm

In [4]:
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

def link_to_csv_detailed(df_original, hackathon_name):
    """
    Scrape detailed project information and combine with original DataFrame
    
    Args:
        df_original: Original DataFrame with columns: hackathon_name, project_name, project_link, winner
        hackathon_name: Name of the hackathon for output file naming
    """
    count = 0
    detailed_data = []

    for index, row in df_original.iterrows():
        url = row['project_link']
        project_name = row['project_name']
        
        try:
            response = req.get(url)
            response.raise_for_status()  # Raise an exception for bad status codes
            soup = BeautifulSoup(response.text, "html.parser")
            soup = soup.find("div", class_="large-9 columns")
            
            # Initialize section columns with dynamic "What's next" section
            sections = {
                "Inspiration": "",
                "What it does": "",
                "How we built it": "",
                "Challenges we ran into": "",
                "Accomplishments that we're proud of": "",
                "What we learned": "",
                "What's next": ""  # Will be filled dynamically
            }

            # Extract content for each section
            h2 = soup.find_all("h2")
            length_h2 = len(h2)
             #if less than 5 then put all the <p> tag on the inspiration section
            
            # Check if we have the standard project description sections
            standard_sections = ["Inspiration", "What it does", "How we built it", "Challenges we ran into", 
                                "Accomplishments that we're proud of", "What we learned"]
            has_standard_sections = any(header.get_text(strip=True) in standard_sections for header in h2)
            
            if not has_standard_sections:
                # No standard sections found, put all p tags in inspiration section
                all_p_tags = soup.find_all("p")
                inspiration_content = []
                for p in all_p_tags:
                    inspiration_content.append(p.get_text(strip=True))
                sections["Inspiration"] = " ".join(inspiration_content)
            else:
                # Normal processing when standard sections are found
                for header in soup.find_all("h2"):
                    section_name = header.get_text(strip=True)
                    
                    # Handle the dynamic "What's next" section
                    if section_name.startswith("What's next"):
                        sections["What's next"] = extract_section_content(header)
                    elif section_name in sections:
                        sections[section_name] = extract_section_content(header)

            # Built With
            built_with = []
            built_with_section = soup.find("div", id="built-with")
            if built_with_section:
                for tech in built_with_section.find_all("span", class_="cp-tag"):
                    built_with.append(tech.get_text(strip=True))

            # GitHub / external links
            links_list = []
            links_section = soup.find("ul", {"data-role": "software-urls"})
            if links_section:
                for a in links_section.find_all("a", href=True):
                    links_list.append(a["href"])

            # Combine original data with scraped details
            detailed_row = {
                # Original columns
                "hackathon_name": row['hackathon_name'],
                "project_name": row['project_name'],
                "project_link": row['project_link'],
                "winner": row['winner'],
                # New detailed columns
                "inspiration": sections["Inspiration"],
                "what_it_does": sections["What it does"],
                "how_we_built_it": sections["How we built it"],
                "challenges": sections["Challenges we ran into"],
                "accomplishments": sections["Accomplishments that we're proud of"],
                "what_we_learned": sections["What we learned"],
                "next_steps": sections["What's next"],
                "built_with": ", ".join(built_with),
                "external_links": ", ".join(links_list)
            }
            
            detailed_data.append(detailed_row)
            print(f"✅ Processed: {project_name} ({count + 1}/{len(df_original)})")
            
        except Exception as e:
            print(f"❌ Error processing {project_name}: {str(e)}")
            # Add row with original data and empty detailed fields
            detailed_row = {
                "hackathon_name": row['hackathon_name'],
                "project_name": row['project_name'],
                "project_link": row['project_link'],
                "winner": row['winner'],
                "inspiration": "",
                "what_it_does": "",
                "how_we_built_it": "",
                "challenges": "",
                "accomplishments": "",
                "what_we_learned": "",
                "next_steps": "",
                "built_with": "",
                "external_links": ""
            }
            detailed_data.append(detailed_row)
        
        count += 1

    # Save to CSV
    df_out = pd.DataFrame(detailed_data)
    output_filename = f"{hackathon_name}_detailed.csv"
    df_out.to_csv(output_filename, index=False)
    print(f"✅ Saved {len(detailed_data)} detailed projects from {hackathon_name} to {output_filename}")
    
    return df_out

def extract_section_content(header):
    """Helper function to extract content from a section"""
    content = []
    sibling = header.find_next_sibling()
    while sibling and sibling.name != "h2":
        if sibling.name == "p":
            content.append(sibling.get_text(strip=True))
        sibling = sibling.find_next_sibling()
    return " ".join(content)

def extract_section_content_with_tracking(header):
    """Helper function to extract content from a section and track which p tags were used"""
    content = []
    p_tags_used = []
    sibling = header.find_next_sibling()
    while sibling and sibling.name != "h2":
        if sibling.name == "p":
            content.append(sibling.get_text(strip=True))
            p_tags_used.append(sibling)
        sibling = sibling.find_next_sibling()
    return " ".join(content), p_tags_used

# Example usage:
# df_original = pd.read_csv("your_original_file.csv")
# df_detailed = link_to_csv_detailed(df_original, "Auburn Hacks 2024")

In [5]:
# Read the CSV
df_original = pd.read_csv("all_hackathons_merged.csv")

df_detailed = link_to_csv_detailed(df_original, "all_detailed.csv")
df_detailed

✅ Processed: Resumate (1/1071)
✅ Processed: Courage The Not-so Cowardly Rover (2/1071)
✅ Processed: Planetarium (3/1071)
✅ Processed: Orbital Sandbox (4/1071)
✅ Processed: OrbitQuest (5/1071)
✅ Processed: Pseduo (6/1071)
✅ Processed: Auburn Quest (7/1071)
✅ Processed: Starfall (8/1071)
✅ Processed: planet simulation (9/1071)
✅ Processed: Constellation Achievements (10/1071)
✅ Processed: SpacePal (11/1071)
✅ Processed: HAL 9000 Desktop Buddy (12/1071)
✅ Processed: Andromeda (13/1071)
✅ Processed: Sound Safari (14/1071)
✅ Processed: Demo (15/1071)
✅ Processed: Solar Flare Prediction (16/1071)
✅ Processed: Spaceify (17/1071)
✅ Processed: Space smarts (18/1071)
✅ Processed: OnePress (19/1071)
✅ Processed: Mars Meteo (20/1071)
✅ Processed: AstroPedia (21/1071)
✅ Processed: Event Horizon (22/1071)
✅ Processed: Spaced Out (23/1071)
✅ Processed: AlienAtlas (24/1071)
✅ Processed: Nebula Navigators (25/1071)
✅ Processed: Starbite (26/1071)
✅ Processed: Playlist Planet (27/1071)
✅ Processed: Plan

Unnamed: 0,hackathon_name,project_name,project_link,winner,inspiration,what_it_does,how_we_built_it,challenges,accomplishments,what_we_learned,next_steps,built_with,external_links
0,Auburn Hacks 2024,Resumate,https://devpost.com/software/resumate-bwk81m,True,Resumate was inspired by the challenge job see...,Resumate is a web application that utilizes ad...,"We built Resumate using Flask for the backend,...",One of the main challenges was ensuring the AI...,We're proud of creating a tool that can genuin...,"Throughout the development of Resumate, we lea...","Looking ahead, we plan to incorporate more per...","css3, flask, html5, javascript, openai",https://github.com/Michaelgathara/resumate
1,Auburn Hacks 2024,Courage The Not-so Cowardly Rover,https://devpost.com/software/courage-the-not-s...,True,Our inspiration came from the resources NASA p...,"In this Mars Rover educational game, players e...",,,,,,"blendr, c#, mixtral, nasa, nginx, ollama, unity",https://github.com/abearinatrap/auburnhack2024
2,Auburn Hacks 2024,Planetarium,https://devpost.com/software/planetarium-nzc1a7,True,Animated Earth Animated sprite fl studio Clip ...,,,,,,,"clipstudiopaint, flstudio, unity",
3,Auburn Hacks 2024,Orbital Sandbox,https://devpost.com/software/orbital-sandbox,True,Interplanetary space exploration is becoming a...,The simulator allows you to fly a spacecraft a...,"The app was built in the Unity game-engine, wh...",The primary challenge for the design was build...,We are very proud of the real-time simulation ...,,"Hopefully, after the completion of AuburnHACKS...","c#, unity",https://github.com/EirikMulder/HackEpicUnityPr...
4,Auburn Hacks 2024,OrbitQuest,https://devpost.com/software/orbitquest,True,The theme this year is space travel. So why no...,OrbitQuesttracks the ISS as it travels around ...,We usedthree.jsfor the 3-D graphics portion of...,One significant challenge we faced was maintai...,We are extremely proud of the appearance of ou...,,"In the future, we want to implement real-time ...","css, debian, digitalocean, django, git, github...","https://orbitquest.co, https://github.com/TheS..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1066,VTHacks 12,SquatMap,https://devpost.com/software/squatmap,False,"Our project, ""SquatDown,"" has goal to assist ...",,,,,,,"css, google-maps, html, javascript, node.js, r...",https://github.com/harshalarakala/squatdown
1067,VTHacks 12,OriGaming,https://devpost.com/software/origaming,False,"Since high school, we have had an interest in ...","0rigAmIng is a ""paper-folding simulator"" that ...",We built 0rigAmIng fully in Godot Engine 3D.,"Where do we even begin? To start, none of us h...",Our first big strides in the project were when...,"Well, we learned that hackathons are crazy! Bu...",,godot,https://github.com/vaaar/OriGaming
1068,VTHacks 12,Ask Hokie,https://devpost.com/software/hokie-plus-plus,False,Ask HOKIE Problem Statement Technical Framewor...,,,,,,,"css3, gpt, llm, mlh, mongodb, propelauth, react",https://github.com/mahatokunal/askHokie
1069,VTHacks 12,GrocerAi,https://devpost.com/software/grocerai,False,,,,,,,,"azure, css, html, javascript, langchain, node....",https://www.lesstimeshoppingandmoretimeto.stud...
