In [None]:
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
# Site root; hrefs found on the listing page are resolved against it.
BASE_URL = "https://www.daiict.ac.in"
# Listing page whose anchors lead to the individual faculty profiles.
FACULTY_URL = f"{BASE_URL}/faculty"

# Sent with every request so the crawler identifies itself.
HEADERS = {"User-Agent": "AcademicProjectBot/1.0 (For educational use only)"}

# Pause, in seconds, between consecutive requests (polite crawling).
DELAY = 2

def fetch_page(url):
    """Download ``url`` and return its body as text.

    The request is sent with the module-level ``HEADERS`` and a 15 second
    timeout. Any ``requests`` failure (including a non-2xx status, via
    ``raise_for_status``) is reported on stdout and turned into ``None``
    instead of being raised.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``None`` when the request failed.
    """
    html = None
    try:
        reply = requests.get(url, headers=HEADERS, timeout=15)
        reply.raise_for_status()
        html = reply.text
    except requests.exceptions.RequestException as exc:
        print(f"[ERROR] Failed to fetch {url} -> {exc}")
    return html


def parse_faculty_list(html):
    """Collect the faculty profile URLs linked from the listing page.

    Every ``<a href=...>`` whose href contains ``"/faculty/"`` is kept,
    resolved against ``BASE_URL``, and de-duplicated.

    Args:
        html: Markup of the faculty listing page.

    Returns:
        A list of absolute URLs in first-seen order, without repeats.
    """
    page = BeautifulSoup(html, "html.parser")

    # A dict is used as an ordered set: keys keep insertion order and a
    # repeated URL leaves the first occurrence in place.
    seen_urls = {}
    for anchor in page.select("a[href]"):
        target = anchor.get("href")
        if not target or "/faculty/" not in target:
            continue  # not a profile link
        seen_urls.setdefault(urljoin(BASE_URL, target), None)

    return list(seen_urls)


def parse_faculty_profile(html, profile_url):
    """Build one record from the markup of a single faculty profile page.

    Args:
        html: Markup of the profile page.
        profile_url: URL the markup was fetched from; copied into the record.

    Returns:
        A dict with the keys ``name``, ``designation``, ``qualification``,
        ``research_interests`` and ``profile_url`` (in that order). A field
        whose element is not found in the page is left as ``None``.
    """
    # NOTE(review): the output recorded in this notebook shows name == "logo"
    # and empty designation/qualification/research_interests for every row.
    # Presumably the first <h1> on the live page is the site logo and the
    # field--name-* classes below are not present in its markup -- confirm
    # against the actual page HTML before relying on these selectors.
    page = BeautifulSoup(html, "html.parser")

    # (record key, CSS class of the <div>, positional arguments for get_text)
    div_fields = (
        ("designation", "field--name-field-designation", ()),
        ("qualification", "field--name-field-qualification", ()),
        # Research interests join their text fragments with a space.
        ("research_interests", "field--name-field-research-interests", (" ",)),
    )

    record = {"name": None}

    # The name is taken from the first <h1> in the document.
    heading = page.find("h1")
    if heading:
        record["name"] = heading.get_text(strip=True)

    for key, css_class, text_args in div_fields:
        node = page.find("div", class_=css_class)
        record[key] = node.get_text(*text_args, strip=True) if node else None

    record["profile_url"] = profile_url
    return record


def main(output_path="daiict_faculty_data.csv"):
    """Scrape every faculty profile and save the records as a CSV file.

    The listing page at ``FACULTY_URL`` is fetched first; each profile link
    found on it is then fetched and parsed in turn, pausing ``DELAY`` seconds
    between consecutive requests.

    Args:
        output_path: Destination of the CSV file. Defaults to
            ``"daiict_faculty_data.csv"``, the path that was previously
            hard-coded.

    Returns:
        A ``pandas.DataFrame`` with one row per profile that was fetched
        successfully, or ``None`` when the listing page could not be loaded.
    """
    print("[INFO] Fetching faculty listing page...")
    main_page = fetch_page(FACULTY_URL)

    if not main_page:
        print("[ERROR] Could not load faculty page.")
        return None

    print("[INFO] Parsing faculty links...")
    faculty_links = parse_faculty_list(main_page)
    total = len(faculty_links)
    print(f"[INFO] Found {total} faculty profile links")

    faculty_data = []
    skipped = []

    for idx, profile_url in enumerate(faculty_links, start=1):
        print(f"[INFO] ({idx}/{total}) Fetching: {profile_url}")
        html = fetch_page(profile_url)

        if html:
            faculty_data.append(parse_faculty_profile(html, profile_url))
        else:
            # Covers both a failed request and an empty response body.
            skipped.append(profile_url)

        # Polite crawling: wait between requests, but not after the last
        # one, where there is nothing left to be polite about.
        if idx < total:
            time.sleep(DELAY)

    # Make dropped profiles visible so a row count lower than the link
    # count is not a surprise.
    if skipped:
        print(f"[WARN] Skipped {len(skipped)} of {total} profiles with no usable response")

    # Save to CSV
    df = pd.DataFrame(faculty_data)
    df.to_csv(output_path, index=False)

    print(f"[SUCCESS] Data saved to {output_path}")
    print(df.head())
    return df


# Run the scrape and keep its result in `df` for the cells that follow.
# main() returns the scraped DataFrame, or None when the listing page could
# not be loaded.
if __name__ == "__main__":
    df = main()


[INFO] Fetching faculty listing page...
[INFO] Parsing faculty links...
[INFO] Found 66 faculty profile links
[INFO] (1/66) Fetching: https://www.daiict.ac.in/faculty/abhishek-gupta
[INFO] (2/66) Fetching: https://www.daiict.ac.in/faculty/abhishek-jindal
[INFO] (3/66) Fetching: https://www.daiict.ac.in/faculty/abhishek-tilva
[INFO] (4/66) Fetching: https://www.daiict.ac.in/faculty/aditya-tatu
[INFO] (5/66) Fetching: https://www.daiict.ac.in/faculty/ajay-beniwal
[INFO] (6/66) Fetching: https://www.daiict.ac.in/faculty/amit-mankodi
[INFO] (7/66) Fetching: https://www.daiict.ac.in/faculty/anil-roy
[INFO] (8/66) Fetching: https://www.daiict.ac.in/faculty/anish-mathuria
[INFO] (9/66) Fetching: https://www.daiict.ac.in/faculty/ankit-vijayvargiya
[INFO] (10/66) Fetching: https://www.daiict.ac.in/faculty/anupam-rana
[INFO] (11/66) Fetching: https://www.daiict.ac.in/faculty/arnab-kumar-ray
[INFO] (12/66) Fetching: https://www.daiict.ac.in/faculty/arpit-rana
[INFO] (13/66) Fetching: https://www.

In [10]:
# Inspect the scraped table.
# NOTE(review): in the recorded output every row has name == "logo" and the
# designation/qualification/research_interests columns are empty, so the
# selectors in parse_faculty_profile presumably do not match the live page
# markup -- verify before using this data.
df

Unnamed: 0,name,designation,qualification,research_interests,profile_url
0,logo,,,,https://www.daiict.ac.in/faculty/abhishek-gupta
1,logo,,,,https://www.daiict.ac.in/faculty/abhishek-jindal
2,logo,,,,https://www.daiict.ac.in/faculty/abhishek-tilva
3,logo,,,,https://www.daiict.ac.in/faculty/aditya-tatu
4,logo,,,,https://www.daiict.ac.in/faculty/ajay-beniwal
...,...,...,...,...,...
61,logo,,,,https://www.daiict.ac.in/faculty/tapas-kumar-m...
62,logo,,,,https://www.daiict.ac.in/faculty/tathagata-ban...
63,logo,,,,https://www.daiict.ac.in/faculty/vinay-palaparthy
64,logo,,,,https://www.daiict.ac.in/faculty/yash-agrawal


In [11]:
# Replace missing values with a readable placeholder. The result is rebound
# to `df` instead of mutating the frame with inplace=True, so the frame
# object displayed by the previous cell is left unmodified.
df = df.fillna("Not Available")

In [20]:
# Scratch dictionary mixing truthy values (11, [1, 2]) with several falsy
# ones (None, empty string, empty list), used by the dict experiments in
# this notebook.
item = {
    1: 11,
    3: None,
    4: "",
    5: [],
    6: [1, 2],
}
item.get(1)

11

In [5]:
# dict.get returns None for a key that is not present, and a None result is
# not displayed. `item` as defined in this notebook has no key 2, which
# matches the absence of recorded output for this cell.
item.get(2)

In [21]:
# Normalise every falsy value (None, "", []) to None and keep truthy values
# as they are. `value or None` does this with a single lookup per key.
cleaned_data = {key: value or None for key, value in item.items()}

# \D matches any non-digit character, so only the digits of the input remain.
re.sub(r'\D', '', '  34')

In [25]:
# Whitespace and letters are both non-digits (\D), so they are removed and
# only the digits are left. `re` comes from the notebook's import cell.
re.sub(r'\D', '', '  a1 34')

'134'

In [18]:
# Map every falsy value in `item` to None; truthy values pass through.
# NOTE(review): the recorded output lacks keys 5 and 6, so it presumably
# predates the current definition of `item` (execution counts are out of
# order) -- re-run to refresh it.
{key: value or None for key, value in item.items()}

{1: 11, 3: None, 4: None}

In [19]:
# Shallow copy of `item`: same keys in the same order, values untouched
# (falsy values such as "" are kept as they are).
dict(item)

{1: 11, 3: None, 4: ''}