In [19]:
import requests
from bs4 import BeautifulSoup
import json
import re
import os
from bs4 import BeautifulSoup as bs

In [20]:
# Parse the locally saved KLU page into the notebook-level `soup` used for
# ad-hoc exploration in the following cell.
with open("KLU.html", mode="r", encoding="utf-8") as fp:
    soup = BeautifulSoup(fp.read(), features="html.parser")

In [21]:
# Exploratory lookup on the notebook-level `soup` (parsed from KLU.html):
# first <div> carrying the "about-section-reserve-height" class, or None when
# the page has no such element.
# NOTE(review): `x` is not read by any cell visible here — confirm whether it
# is still needed.
x = soup.find('div', class_="about-section-reserve-height")

In [22]:
def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: String to normalise. ``None`` is accepted because
            BeautifulSoup's ``Tag.string`` yields ``None`` for tags that are
            empty or have more than one child, and callers pass that value
            straight through.

    Returns:
        The normalised string, or ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    # Remove extra whitespace and newlines
    return re.sub(r'\s+', ' ', text).strip()

In [23]:
def extract_text_from_element(element):
    """Return the whitespace-normalised text of a text-bearing element.

    Only paragraph, list and table elements (``p``, ``ul``, ``ol``,
    ``table``) contribute text; any other element yields an empty string.
    """
    text_bearing_tags = ('p', 'ul', 'ol', 'table')
    if element.name not in text_bearing_tags:
        return ""
    raw_text = element.get_text()
    return clean_text(raw_text)

In [24]:
def scrape(html_content):
    """Parse a saved college page and return its content as a dict.

    Args:
        html_content: HTML markup of the page, as a string.

    Returns:
        A dict with these keys:
          * "title", "meta_description", "keywords": whitespace-normalised
            strings, "" when the corresponding tag is absent.
          * "structured_data": list of decoded ``application/ld+json``
            payloads (empty when the page has none).
          * "basic_info": list of ``{"name", "description"}`` dicts, one per
            h2-h6 heading inside the about section's content div.
          * "Section": ``{"name", "description"}`` for the about section
            (``{}`` when the section is absent).
          * one key per table, named after the nearest preceding ``<h2>``
            (or "No header found"), mapping to a list of row dicts.
          * "faqs": list of ``{"question", "answer"}`` dicts (empty when the
            page has no FAQ block).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # JSON-LD lives inside <script> tags, so it has to be read before the
    # clean-up below removes every <script> from the tree.
    structured_data = _extract_structured_data(soup)

    # Remove non-content elements. "href" is an attribute rather than a tag
    # name, so it matches nothing in well-formed HTML; kept as in the original.
    for tag in soup(["script", "style", "img", "link", "href"]):
        tag.decompose()

    parse_data = {
        "title": "",
        "meta_description": "",
        "keywords": "",
        "structured_data": structured_data,
        "basic_info": [],
        "Section": {},
    }

    # Extract title. get_text() is used instead of .string because .string is
    # None for an empty <title> or one with several children.
    title_tag = soup.find('title')
    if title_tag:
        parse_data["title"] = clean_text(title_tag.get_text())

    # Extract meta description
    meta_desc = soup.find('meta', attrs={'name': 'description'})
    if meta_desc:
        parse_data["meta_description"] = clean_text(meta_desc.get('content', ''))

    # Extract keywords
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    if keywords:
        parse_data["keywords"] = clean_text(keywords.get('content', ''))

    # Extract basic information from the first section
    parse_data["Section"], parse_data["basic_info"] = _extract_about_section(soup)

    # Extract tabular data. NOTE: tables that share the same preceding <h2>
    # overwrite each other, and a heading equal to an existing key replaces
    # that key; this keying scheme is unchanged from the original.
    parse_data.update(_extract_tables(soup))

    parse_data['faqs'] = _extract_faqs(soup)
    return parse_data


def _extract_structured_data(soup):
    """Return the decoded payload of every application/ld+json script tag."""
    payloads = []
    for tag in soup.find_all('script', type='application/ld+json'):
        raw_json = tag.string
        if not raw_json:
            continue  # Empty tag: nothing to decode
        try:
            payloads.append(json.loads(raw_json))
        except json.JSONDecodeError:
            continue  # Ignore invalid JSON
    return payloads


def _extract_about_section(soup):
    """Return (section, basic_info) built from the page's about section."""
    section = {}
    basic_info = []

    about_section = soup.find('section', class_='bg-white rounded-16 p-6')
    if about_section is None:
        return section, basic_info

    section_heading = about_section.find('h2')
    section["name"] = clean_text(section_heading.text) if section_heading else ""

    content_div = about_section.find('div', class_='content-section')
    if content_div is None:
        return section, basic_info

    paragraphs = content_div.find_all('p')
    section["description"] = "\n".join([clean_text(p.text) for p in paragraphs])

    heading_tags = ['h2', 'h3', 'h4', 'h5', 'h6']
    for header in content_div.find_all(heading_tags):
        description = []

        # Collect all paragraph siblings until the next header or end of content
        for sibling in header.find_next_siblings():
            if sibling.name in heading_tags:
                break
            if sibling.name == 'p':
                description.append(extract_text_from_element(sibling))

        basic_info.append({
            "name": clean_text(header.text),
            "description": [item for item in description if item]
        })

    return section, basic_info


def _parse_span(cell, attribute):
    """Return a cell's rowspan/colspan as an int >= 1 (1 if missing or malformed)."""
    try:
        return max(int(cell.get(attribute, 1)), 1)
    except (TypeError, ValueError):
        return 1


def _fill_rowspans(row_data, col_index, headers, rowspan_tracker, table_data):
    """Copy previous-row values into columns still covered by a rowspan.

    Starts at ``col_index`` and returns the index of the first column that is
    not covered by a pending rowspan.
    """
    while col_index < len(headers) and rowspan_tracker[col_index] > 0:
        if table_data:
            row_data[headers[col_index]] = table_data[-1].get(headers[col_index], "")
        rowspan_tracker[col_index] -= 1
        col_index += 1
    return col_index


def _extract_tables(soup):
    """Return {heading text: list of row dicts} for every table with a class."""
    tables_by_heading = {}

    tables = soup.find_all('table', class_=lambda x: x is not None and x.strip() != "")
    for table in tables:
        header = table.find_previous('h2')
        header_text = header.text.strip() if header else "No header found"

        # Column names come from every <th> in the table
        headers = [th.text.strip() for th in table.find_all('th')]

        table_data = []

        # Remaining number of rows each column is still covered by a rowspan
        rowspan_tracker = [0] * len(headers)

        for row in table.find_all('tr')[1:]:  # Skip the header row
            row_data = {}
            col_index = 0

            for cell in row.find_all(['td', 'th']):
                col_index = _fill_rowspans(row_data, col_index, headers, rowspan_tracker, table_data)
                if col_index >= len(headers):
                    continue

                rowspan = _parse_span(cell, 'rowspan')
                colspan = _parse_span(cell, 'colspan')

                for _ in range(colspan):
                    if col_index < len(headers):
                        row_data[headers[col_index]] = cell.text.strip()
                        if rowspan > 1:
                            rowspan_tracker[col_index] = rowspan - 1
                        col_index += 1

            # Columns spanned from an earlier row can also sit after the last
            # cell of this row (or the row can have no cells at all); carry
            # them over too so values are kept and the tracker stays in sync.
            _fill_rowspans(row_data, col_index, headers, rowspan_tracker, table_data)

            if row_data:  # Avoid adding empty rows
                table_data.append(row_data)

        tables_by_heading[header_text] = table_data

    return tables_by_heading


def _extract_faqs(soup):
    """Return the page's FAQ entries as a list of question/answer dicts."""
    faq_div = soup.find('div', class_='cdcms_faqs')
    if faq_div is None:
        return []  # Page has no FAQ block

    questions = faq_div.find_all('p', class_='accordio')
    answers = faq_div.find_all('div', class_='liv')

    # Questions and answers are paired by position
    return [
        {
            'question': question.get_text(strip=True).replace("Ques. ", ""),
            'answer': answer.get_text(strip=True).replace("Ans. ", "")
        }
        for question, answer in zip(questions, answers)
    ]

In [25]:
# Saved HTML page to scrape.
# NOTE(review): this module-level name is not read by any code visible here —
# Scraper() has its own `file_name` parameter that shadows it. Confirm whether
# it is still needed.
file_name = 'SRM.html'

def Scraper(file_name):
    """Scrape a saved HTML page and write the result next to it as JSON.

    Args:
        file_name: Path of the HTML file to read (UTF-8). The output path is
            the same path with its extension replaced by ``.json``.

    Returns:
        None. The scraped data is written to disk and echoed via ``print``.
    """
    name = os.path.splitext(file_name)[0]
    output_path = f'{name}.json'

    with open(file_name, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Run the scraper
    scraped_data = scrape(html_content)

    # Save the scraped data to a JSON file
    with open(output_path, 'w', encoding='utf-8') as json_file:
        json.dump(scraped_data, json_file, ensure_ascii=False, indent=4)

    # Report the path that was actually written, not a fixed file name
    print(f"Focused scraping completed. Data saved to {output_path}\n", scraped_data)

In [26]:
# Scrape the saved SRM page; Scraper derives the output name from the input,
# so the result is written to SRM.json.
Scraper('SRM.html')

Focused scraping completed. Data saved to focused_data.json
 {'title': 'SRM University (SRMIST) Chennai: Admission 2024 (Open), Courses, Fees, Placements', 'meta_description': 'SRM Institute of Science and Technology Chennai is spread over 250 acres and offers UG, PG and Doctoral programs in the fields of Engineering & Technology, Management, Medicine & Health Sciences, Science & Humanities, Law and Agricultural Sciences. SRM University is a Deemed-to-be University located in Chennai, Tamil Nadu was established in 1985.', 'keywords': 'SRM University, SRM University Chennai, SRM IST Chennai, SRM University Fees, SRMJEEE fee stucture, SRM University Scholarship, SRM Chennai Btech fees, SRM IST, SRM Institute of Science and Technology, SRM Institute of Science and Technology dates, SRM Institute of Science and Technology admission 2023, SRM Institute of Science and Technology admission 2024, SRM Institute of Science and Technology fees, SRM Institute of Science and Technology ranking, SRM