In [2]:
import pandas as pd
import numpy as np
import requests as req
from bs4 import BeautifulSoup
import math
from tqdm import tqdm

In [6]:
import pandas as pd
import requests as req
from bs4 import BeautifulSoup

# Columns copied unchanged from the input DataFrame, in output order.
_ORIGINAL_COLUMNS = ("hackathon_name", "project_name", "project_link", "winner")

# Columns filled in by scraping the project page, in output order.
_DETAIL_COLUMNS = (
    "inspiration",
    "what_it_does",
    "how_we_built_it",
    "challenges",
    "accomplishments",
    "what_we_learned",
    "next_steps",
    "built_with",
    "external_links",
)

# Maps each recognised <h2> heading to the output column that stores its text.
_SECTION_TO_COLUMN = {
    "Inspiration": "inspiration",
    "What it does": "what_it_does",
    "How we built it": "how_we_built_it",
    "Challenges we ran into": "challenges",
    "Accomplishments that we're proud of": "accomplishments",
    "What we learned": "what_we_learned",
}

# The "What's next" heading has a variable tail, so it is matched by prefix
# and stored in the "next_steps" column.
_NEXT_STEPS_PREFIX = "What's next"


def _scrape_project_details(url, timeout):
    """Fetch one project page and return its scraped fields keyed by _DETAIL_COLUMNS."""
    response = req.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception for bad status codes

    page = BeautifulSoup(response.text, "html.parser")
    container = page.find("div", class_="large-9 columns")
    if container is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("project description container not found on page")

    details = dict.fromkeys(_DETAIL_COLUMNS, "")

    headers = container.find_all("h2")
    has_standard_sections = any(
        header.get_text(strip=True) in _SECTION_TO_COLUMN for header in headers
    )

    if not has_standard_sections:
        # No standard headings: keep the free-form text by putting every
        # paragraph of the container into the inspiration column.
        details["inspiration"] = " ".join(
            p.get_text(strip=True) for p in container.find_all("p")
        )
    else:
        for header in headers:
            section_name = header.get_text(strip=True)
            if section_name.startswith(_NEXT_STEPS_PREFIX):
                details["next_steps"] = extract_section_content(header)
            elif section_name in _SECTION_TO_COLUMN:
                details[_SECTION_TO_COLUMN[section_name]] = extract_section_content(header)

    # Built With
    built_with_section = container.find("div", id="built-with")
    if built_with_section:
        details["built_with"] = ", ".join(
            tech.get_text(strip=True)
            for tech in built_with_section.find_all("span", class_="cp-tag")
        )

    # GitHub / external links
    links_section = container.find("ul", {"data-role": "software-urls"})
    if links_section:
        details["external_links"] = ", ".join(
            a["href"] for a in links_section.find_all("a", href=True)
        )

    return details


def link_to_csv_detailed(df_original, hackathon_name, timeout=30):
    """
    Scrape detailed project information and combine with original DataFrame

    One HTTP request is made per input row. A row whose page cannot be
    fetched or parsed is still emitted, with its scraped fields left empty,
    so the output always has one row per input row.

    Args:
        df_original: Original DataFrame with columns: hackathon_name, project_name, project_link, winner
        hackathon_name: Name of the hackathon for output file naming; the
            result is written to "<hackathon_name>_detailed.csv"
        timeout: Seconds to wait on each HTTP request before giving up, so a
            single stalled connection cannot hang the whole run

    Returns:
        DataFrame holding the four original columns followed by the scraped
        columns. The columns are present even when df_original has no rows.
    """
    total = len(df_original)
    detailed_data = []

    for position, (_, row) in enumerate(df_original.iterrows(), start=1):
        project_name = row['project_name']
        detailed_row = {column: row[column] for column in _ORIGINAL_COLUMNS}

        try:
            scraped = _scrape_project_details(row['project_link'], timeout)
        except Exception as e:
            # Deliberately broad: this is the per-row boundary of a long
            # best-effort scrape, and one bad page must not abort the run.
            print(f"❌ Error processing {project_name}: {str(e)}")
            scraped = dict.fromkeys(_DETAIL_COLUMNS, "")
        else:
            print(f"✅ Processed: {project_name} ({position}/{total})")

        detailed_row.update(scraped)
        detailed_data.append(detailed_row)

    # Save to CSV. Passing the columns explicitly keeps the header row even
    # when there is nothing to write, so the file can always be read back.
    df_out = pd.DataFrame(detailed_data, columns=list(_ORIGINAL_COLUMNS + _DETAIL_COLUMNS))
    output_filename = f"{hackathon_name}_detailed.csv"
    df_out.to_csv(output_filename, index=False)
    print(f"✅ Saved {len(detailed_data)} detailed projects from {hackathon_name} to {output_filename}")

    return df_out

def extract_section_content(header):
    """Return the text of the <p> siblings that follow ``header``.

    Siblings are walked in order until the next <h2> (or the end of the
    sibling chain). Only <p> elements contribute; their stripped texts are
    joined with single spaces. An empty string is returned when no
    paragraph is found.
    """
    paragraphs = []
    node = header.find_next_sibling()
    while True:
        # Stop at the end of the chain or at the start of the next section.
        if not node or node.name == "h2":
            break
        if node.name == "p":
            paragraphs.append(node.get_text(strip=True))
        node = node.find_next_sibling()
    return " ".join(paragraphs)

def extract_section_content_with_tracking(header):
    """Return the section text together with the <p> elements it came from.

    Walks the siblings after ``header`` until the next <h2> (or the end of
    the sibling chain) and keeps every <p> element met on the way.

    Returns:
        A ``(text, p_tags)`` tuple: the stripped paragraph texts joined with
        single spaces, and the list of the <p> elements in document order.
    """
    matched_paragraphs = []
    node = header.find_next_sibling()
    while node and node.name != "h2":
        if node.name == "p":
            matched_paragraphs.append(node)
        node = node.find_next_sibling()

    texts = [paragraph.get_text(strip=True) for paragraph in matched_paragraphs]
    return " ".join(texts), matched_paragraphs

# Example usage:
# df_original = pd.read_csv("your_original_file.csv")
# df_detailed = link_to_csv_detailed(df_original, "Auburn Hacks 2024")

In [9]:
# Read the CSV of projects to enrich. link_to_csv_detailed reads the
# hackathon_name, project_name, project_link and winner columns from it.
df_original = pd.read_csv("all_projects.csv")
# Scrape every project page (one HTTP request per row) and write the combined
# result to "All_projects_with_Details_detailed.csv".
df_detailed = link_to_csv_detailed(df_original, "All_projects_with_Details")
# Bare expression as the last line of the cell — presumably there so the
# notebook renders the resulting DataFrame.
df_detailed

✅ Processed: QuaranTime (1/7495)
✅ Processed: FridgeSpace (2/7495)
✅ Processed: helppier (3/7495)
✅ Processed: mashme (4/7495)
✅ Processed: Mall Monitor (5/7495)
✅ Processed: CAST3D - Scanner for Medicinal Casts (6/7495)
✅ Processed: Legist (7/7495)
✅ Processed: Jump! (8/7495)
✅ Processed: Temporal Rift (9/7495)
✅ Processed: Ctrl+Air+Space (10/7495)
✅ Processed: Karaoke Party (11/7495)
✅ Processed: trytobreak.me (12/7495)
✅ Processed: Recipe2Go (13/7495)
✅ Processed: CopyCat (14/7495)
✅ Processed: Merge CountFlicts (15/7495)
✅ Processed: Desky bot (16/7495)
✅ Processed: FitBuddy (17/7495)
✅ Processed: QuickMark (18/7495)
✅ Processed: Tabular (19/7495)
✅ Processed: Corona Escape (20/7495)
✅ Processed: ClinicConnect (21/7495)
✅ Processed: Subspace (22/7495)
✅ Processed: groupShot (23/7495)
✅ Processed: flock (24/7495)
✅ Processed: PriceEasy (25/7495)
✅ Processed: Sofia: The No-Internet-Needed Personal Assistant (26/7495)
✅ Processed: Paint Pong (27/7495)
✅ Processed: VacAlert (28/7495)
✅

Unnamed: 0,hackathon_name,project_name,project_link,winner,inspiration,what_it_does,how_we_built_it,challenges,accomplishments,what_we_learned,next_steps,built_with,external_links
0,Hack The North 2020,QuaranTime,https://devpost.com/software/boredombuster-hnimzc,True,Due to being stuck at home during Quarantine s...,QuaranTime is an app where a schedule is gener...,We decided to face a new challenge. Due to the...,There were many challenges that we faced. One ...,We are proud of the ability to use the Google ...,We were able to learn much more than expected ...,"If we had more time, we would add features suc...","dart, flutter, googleapis",https://github.com/JackFrostDJ/HTN-App/tree/ma...
1,Hack The North 2020,FridgeSpace,https://devpost.com/software/fridgespace,True,Based on the observation that most household w...,FridgeSpace allows household users to locate a...,We began by creating designs for our app using...,The biggest challenge we had to face was learn...,,There were many new technologies we learned in...,Future Steps for our team include working with...,"cockroachdb, flask, python, sqlalchemy, swift,...",https://github.com/Harin329/FridgeSpace
2,Hack The North 2020,helppier,https://devpost.com/software/helppier-k5lbru,True,Inspired by the newly born solidarity within t...,Helppier's goal is to encourage a new social n...,In order to build a decentralized network wher...,While the developers of the team have experien...,We are extremely proud to have been able to de...,,,"azure, blockchain, cockroach.db, css3, express...","https://github.com/aminecs/magic4, https://www..."
3,Hack The North 2020,mashme,https://devpost.com/software/mashme,True,We're a team of 'music people' who are in a ba...,"mashme allows the user to select two songs, an...",,,,,ML training for chorus and verse separation! C...,"bootstrap, domain.com, flask, google-compute-e...",http://mashme.tech
4,Hack The North 2020,Mall Monitor,https://devpost.com/software/mall-monitor,True,Covid-19 has ravaged the World since early 202...,Mall Monitor allows owners and guests to monit...,,,,,Actually deploying and consuming the REST API ...,"angular.js, azure, css, flask, html, javascrip...","https://github.com/RemeAjayi/mall-monitor, htt..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7490,HackRice 14 2024,Degree Map AI,https://devpost.com/software/degree-planner-ai,False,As current undergraduate students trying to na...,Degree Map AI includes a React front-end and a...,"We first started by working on the scraper, fr...",One challenge we faced was figuring out the be...,We're proud to have successfully connected the...,"We learned to set up OpenAI's ChatGPT, scrape ...","Given the limited amount of time, we were only...","c++, javascript, openai, pyflask, python, react",https://github.com/april2546/HackRice2024
7491,HackRice 14 2024,Money Mania,https://devpost.com/software/money-mania,False,Our inspiration for this project was this book...,This game teaches people about personal financ...,We built this in Python using PyGame. We split...,None of us have ever participated in a hackath...,We are proud of the fact we have a finished pr...,We learned that creating a layout in PyGame is...,Money Mania will have expansions in the invest...,"pygame, python",https://github.com/ashwinrao1/Hackathon24
7492,HackRice 14 2024,AudioAnalyze,https://devpost.com/software/audioanalyze-muw9zk,False,The inspiration behind this project came from ...,,,,,,,"ai, generative, lambda-(for-serverless-process...",https://github.com/Ayush374/Ai_Audio_Analysis
7493,HackRice 14 2024,RestoreIO,https://devpost.com/software/tbd-ir3qd5,False,RestoreIO was created in response to the deman...,,,,,,,"ai, computer, computervision, love, mediapipe,...",https://github.com/OfficialCodeVoyage/HackRice...
