In [14]:
%pip install requests beautifulsoup4
import requests
from bs4 import BeautifulSoup

# College website URL
url = "https://www.sanjivanicoe.org.in"

# Send HTTP request
response = requests.get(url)

# Check if request is successful
if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract page title
    title = soup.title.text
    print("Page Title:", title)

    print("\nAll Links:")
    for link in soup.find_all("a"):
        href = link.get("href")
        text = link.get_text(strip=True)
        print(text, "->", href)
else:
    print("Failed to fetch the webpage")


Note: you may need to restart the kernel to use updated packages.
Page Title: Sanjivani College of Engineering

All Links:
 -> /
Contact -> /index.php/contact
Apply online -> http://www.sanjivanicoe.org.in/files/coeweb/landing-page/college-of-engineering-in-kopargaon-admission-2020.html
 -> /
Home -> /
About Us -> /index.php/about-us/our-institutes
Our Inspiration -> /index.php/about-us/our-inspiration
Chairman's Desk -> /index.php/about-us/chairman-s-desk
Managing Trustee -> /index.php/about-us/managing-trustee
Board Of Trustees -> /index.php/about-us/board-of-trustees
Governing Body -> /index.php/about-us/sz-governing-body
Our Institutes -> /index.php/about-us/our-institutes
Directors's Desk -> /index.php/about-us/principal-s-desk
Video Lectures -> /index.php/about-us/video-lectures
Infrastructure -> /index.php/library/central-library
Computer Center -> /index.php/about-us/infrastructure/computer-center
Workshop -> /index.php/about-us/infrastructure/workshop
Language Lab -> /index.ph

In [15]:
import requests
from bs4 import BeautifulSoup
import csv

# -------------------------
# 1. Target URLs
# -------------------------
base_url = "https://www.sanjivanicoe.org.in"
pages = {
    "home": base_url,
    "director": base_url + "/index.php/about-us/principal-s-desk",
    "committee": base_url + "/index.php/student-zone/committees/library-committee",
    "academic_council": base_url + "/index.php/student-zone/academic-council-member",
    "hod_it": base_url + "/index.php/department/information-technology/hod-s-desk",
}

# Store scraped data
data = {
    "College Name": "",
    "Director": "",
    "Departments": [],
    "HOD IT": "",
    "Staff": []
}

# -------------------------
# 2. Fetch and Parse HTML
# -------------------------
def get_soup(url):
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return BeautifulSoup(r.text, "html.parser")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

# -------------------------------------------------------
# 3. Scrape College Name + Departments from Home Page
# -------------------------------------------------------
soup_home = get_soup(pages["home"])
if soup_home:
    # College name (from title or header)
    title_tag = soup_home.title
    data["College Name"] = title_tag.text.strip() if title_tag else "Sanjivani College of Engineering"
    
    # Departments from menu
    depts = soup_home.select("ul li a")
    for d in depts:
        name = d.get_text(strip=True)
        if name and ("Engineering" in name or "Information Technology" in name):
            data["Departments"].append(name)

# -------------------------
# 4. Scrape Director
# -------------------------
soup_dir = get_soup(pages["director"])
if soup_dir:
    # Look for text near 'Director'
    text = soup_dir.get_text(separator="\n")
    for line in text.split("\n"):
        if "Director" in line and "Dr." in line:
            data["Director"] = line.strip()

# -----------------------------------------------
# 5. Scrape HOD IT
# -----------------------------------------------
soup_hod_it = get_soup(pages["hod_it"])
if soup_hod_it:
    text = soup_hod_it.get_text(separator="\n")
    # Look for a line that likely has an HOD name
    for line in text.split("\n"):
        if "Professor" in line or "Dr." in line or "HOD" in line:
            if "Information Technology" in line or "Head" in line:
                data["HOD IT"] = line.strip()

# -------------------------------------------------
# 6. Scrape Staff from committee pages
# -------------------------------------------------
# A combined approach: check academic council & library committee
for key in ["committee", "academic_council"]:
    soup_page = get_soup(pages[key])
    if soup_page:
        text = soup_page.get_text(separator="\n")
        for line in text.split("\n"):
            line_clean = line.strip()
            if line_clean and ("Dr." in line_clean or "Prof." in line_clean):
                data["Staff"].append(line_clean)

# Remove duplicates
data["Staff"] = list(set(data["Staff"]))

# -------------------------
# 7. Save to CSV
# -------------------------
with open("sanjivani_staff.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Category", "Details"])
    writer.writerow(["College Name", data["College Name"]])
    writer.writerow(["Director", data["Director"]])
    writer.writerow(["Departments", ", ".join(data["Departments"])])
    writer.writerow(["HOD IT", data["HOD IT"]])
    writer.writerow(["Staff Names", "|".join(data["Staff"])])

print("Scraping completed! Data saved to sanjivani_staff.csv")


Scraping completed! Data saved to sanjivani_staff.csv


In [16]:
import requests
from bs4 import BeautifulSoup

# ----------------------------
# URLs
# ----------------------------
HOME_URL = "https://www.sanjivanicoe.org.in"
DIRECTOR_URL = "https://www.sanjivanicoe.org.in/index.php/about-us/principal-s-desk"
IT_HOD_URL = "https://www.sanjivanicoe.org.in/index.php/department/information-technology/hod-s-desk"
IT_STAFF_URL = "https://www.sanjivanicoe.org.in/index.php/department/information-technology/faculty"

# ----------------------------
# Helper function
# ----------------------------
def get_soup(url):
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

print("\n========== COLLEGE INFORMATION ==========\n")

# ----------------------------
# 1. College Name
# ----------------------------
home_soup = get_soup(HOME_URL)
college_name = home_soup.title.text.strip()

print("1. College Name:")
print(college_name, "\n")

# ----------------------------
# 2. Director / Principal Name
# ----------------------------
director_soup = get_soup(DIRECTOR_URL)
director_name = "Not Found"

for line in director_soup.get_text(separator="\n").split("\n"):
    line = line.strip()
    if line.startswith("Dr.") or line.startswith("Prof."):
        director_name = line
        break

print("2. Director / Principal Name:")
print(director_name, "\n")

# ----------------------------
# 3. Department Name
# ----------------------------
print("3. Department Name:")
print("Information Technology\n")

# ----------------------------
# 4. HOD Name (IT)
# ----------------------------
hod_soup = get_soup(IT_HOD_URL)
hod_name = "Not Found"

for line in hod_soup.get_text(separator="\n").split("\n"):
    line = line.strip()
    if line.startswith("Dr.") or line.startswith("Prof."):
        hod_name = line
        break

print("4. HOD Name:")
print(hod_name, "\n")

# # ----------------------------
# # 5. Staff Names (IT)
# # ----------------------------
# print("5. IT Department Faculty Members:")
# if staff_names:
#     count = 1
#     for raw_name in sorted(staff_names):
#         final_name = faculty_mapping.get(raw_name, raw_name)
#         print(f"{count}. {final_name}")
#         count += 1
# else:
#     print("Faculty data not found")


print("\n========== END ==========\n")



1. College Name:
Sanjivani College of Engineering 

2. Director / Principal Name:
Dr. R. A. Kapgate 

3. Department Name:
Information Technology

4. HOD Name:
Prof. Dr. Madhuri A.Jawale 





In [20]:
import requests
from bs4 import BeautifulSoup
import re

URL = "https://www.sanjivanicoe.org.in/index.php/department/information-technology/faculty"

response = requests.get(URL, timeout=10)
response.raise_for_status()

soup = BeautifulSoup(response.text, "html.parser")

text = soup.get_text(separator="\n")

faculty_names = []

# Regex for faculty names
pattern = re.compile(
    r"(Prof\. Dr\.|Dr\.|Prof\.|Mr\.|Mrs\.|Miss\.)\s+[A-Za-z.\s]"
)

for line in text.split("\n"):
    line = line.strip()
    if pattern.fullmatch(line):
        if line not in faculty_names:
            faculty_names.append(line)

print("\n======= IT DEPARTMENT FACULTY ==========\n")

if faculty_names:
    for i, name in enumerate(faculty_names, start=1):
        print(f"{i}. {name}")
else:
    print("Faculty data not found")

print("\n=======================================")


ReadTimeout: HTTPSConnectionPool(host='www.sanjivanicoe.org.in', port=443): Read timed out. (read timeout=10)