In [1]:
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# extracting info from webpage
def scrape_ics_article(url, timeout=30):
    """Download a web page and return the text of its non-empty ``<p>`` elements.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout the request can
        block indefinitely on an unresponsive server. Defaults to 30 seconds.

    Returns
    -------
    str
        The whitespace-stripped text of every ``<p>`` element that has any
        text, in document order, joined with newlines. Empty string if the
        page has no such paragraphs.

    Raises
    ------
    requests.exceptions.RequestException
        On connection errors, timeouts, or (via ``raise_for_status``) a
        4xx/5xx HTTP response.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                      "AppleWebKit/537.36 (KHTML, like Gecko) " +
                      "Chrome/114.0.0.0 Safari/537.36"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors: otherwise the body of a 404/500 page would be
    # parsed below and its text silently returned as if it were real content.
    res.raise_for_status()

    soup = BeautifulSoup(res.content, "html.parser")

    stripped = (p.get_text(strip=True) for p in soup.find_all("p"))
    # Skip paragraphs that are empty after stripping.
    return "\n".join(text for text in stripped if text)

In [None]:
# extracting chunks from several webpages
links = [
    'https://www.iitk.ac.in/counsel/events.php',
    'https://www.iitk.ac.in/counsel/workshop-sessions.php',
    'https://www.iitk.ac.in/counsel/academic-support.php',
    'https://www.iitk.ac.in/counsel/ug-information.php',
    'https://www.iitk.ac.in/counsel/pg-information.php',
]
# One scraped text blob per URL, in the same order as `links`.
big_chunks = [scrape_ics_article(page_url) for page_url in links]

In [8]:
# Sanity check: how many scraped page texts are held in big_chunks.
print(len(big_chunks))

5


In [None]:
# chunkifying function which shortens data chunks to processable ones 
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Fetch the tokenizer data packages by name. Both names are requested,
# presumably so that older and newer NLTK releases each find the resource
# they look for -- TODO confirm against the installed NLTK version.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

def chunk_text(text, max_words=400, overlap=100,
               sent_tokenizer=None, word_tokenizer=None):
    """Split ``text`` into sentence-aligned chunks with trailing-sentence overlap.

    Sentences are packed greedily into a chunk until adding the next one would
    push the chunk past ``max_words`` tokens. The chunk is then emitted and the
    next chunk is seeded with the last few sentences of the previous one
    (enough whole sentences to reach at least ``overlap`` tokens).

    Parameters
    ----------
    text : str
        Text to split.
    max_words : int, optional
        Target upper bound on tokens per chunk. A single sentence that is
        longer than this is emitted as its own (oversized) chunk rather than
        being split mid-sentence.
    overlap : int, optional
        Minimum number of tokens to carry over from the end of one chunk to
        the start of the next. ``0`` (or negative) disables overlap. The
        carried-over sentences are trimmed from the front when they would
        leave no room for the next sentence.
    sent_tokenizer : callable, optional
        ``str -> iterable of sentence strings``. Defaults to the
        ``sent_tokenize`` name in scope.
    word_tokenizer : callable, optional
        ``str -> sequence of tokens``; only its length is used. Defaults to
        the ``word_tokenize`` name in scope.

    Returns
    -------
    list of str
        The chunks, each a space-joined run of consecutive sentences.
        Empty list when ``text`` contains no sentences.
    """
    if sent_tokenizer is None:
        sent_tokenizer = sent_tokenize
    if word_tokenizer is None:
        word_tokenizer = word_tokenize

    sentences = list(sent_tokenizer(text))
    # Tokenise every sentence exactly once; the counts are reused for packing
    # and for the overlap computation.
    word_counts = [len(word_tokenizer(sentence)) for sentence in sentences]

    chunks = []
    current = []        # indices into `sentences` of the chunk being built
    current_words = 0

    i = 0
    while i < len(sentences):
        # `not current` lets a sentence longer than max_words start a chunk on
        # its own. Without it such a sentence can never be placed and the
        # loop would spin forever emitting empty chunks.
        if not current or current_words + word_counts[i] <= max_words:
            current.append(i)
            current_words += word_counts[i]
            i += 1
            continue

        # Sentence i does not fit: emit the chunk built so far.
        chunks.append(' '.join(sentences[j] for j in current).strip())

        # Walk backwards collecting whole sentences until at least `overlap`
        # tokens are carried into the next chunk.
        carried = []
        carried_words = 0
        if overlap > 0:
            for j in reversed(current):
                if carried_words >= overlap:
                    break
                carried.append(j)
                carried_words += word_counts[j]
            carried.reverse()

        # Guarantee progress: if the carried sentences leave no room for
        # sentence i (this includes the case where the whole chunk was
        # carried), drop the oldest carried sentences until it fits.
        while carried and carried_words + word_counts[i] > max_words:
            carried_words -= word_counts[carried.pop(0)]

        current = carried
        current_words = carried_words

    # Save any remaining chunk
    if current:
        chunks.append(' '.join(sentences[j] for j in current).strip())

    return chunks

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shilp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shilp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# dividing data into smaller chunks
# Split every scraped page text with chunk_text and flatten the results into
# a single list, preserving page order and chunk order within each page.
small_chunks = [
    piece
    for page_text in big_chunks
    for piece in chunk_text(page_text)
]

print(small_chunks[:5])

['Quoting our alumni, “Orientation is one of the best memories of IITK." The Institute Counselling Service conducted a 11-day long Orientation for the new Freshers batch to help them get acclimatized to campus and bond with their batchmates. It is the largest College Orientation programme in the nation. Students are introduced to Academic Structure, Administrative Structure, Students\' Gymkhana, Clubs & Societies and the various opportunities IITK has for its students. Orientation also contains games, wing activities, interactive sessions and a campus tour with the aim to nurture the newly formed bonds among the students. Memories made here stay forever. World Suicide Prevention Day is an awareness day observed on 10 September every year to provide worldwide commitment and action to prevent suicide with various activities worldwide since 2003. IIT Kanpur also organises events every year to raise awareness that suicide is preventable and to decrease stigma regarding suicide. This year t

In [11]:
# Number of chunks held in small_chunks at this point.
print(len(small_chunks))

61


In [None]:
# cleaning data
import unicodedata
import re

def clean_chunk(raw_html):
    """Return a normalised plain-text version of ``raw_html``.

    Steps, in order:
      1. Parse with BeautifulSoup and keep only the text, joining text
         fragments with a single space.
      2. Apply Unicode NFKC normalisation.
      3. Delete URLs (``http...`` / ``www....``) and anything shaped like
         ``something@something``.
      4. Collapse every run of whitespace to one space.
      5. Insert a space after a period that is immediately followed by an
         uppercase ASCII letter.
      6. Strip leading/trailing whitespace.
    """
    plain = BeautifulSoup(raw_html, "html.parser").get_text(separator=" ")
    plain = unicodedata.normalize("NFKC", plain)

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r"http\S+|www\.\S+", ""),   # urls
        (r"\S+@\S+", ""),            # emails
        (r"\s+", " "),               # bad spacing
        (r"\.([A-Z])", r". \1"),     # missing space after a full stop
    )
    for pattern, replacement in substitutions:
        plain = re.sub(pattern, replacement, plain)

    return plain.strip()

In [None]:
# cleaning data
# Run every chunk through clean_chunk, overwriting the entries of the same list.
for i, raw_chunk in enumerate(small_chunks):
    small_chunks[i] = clean_chunk(raw_chunk)

In [None]:
#adding data to dataset
import json 
import os

# Load the existing dataset (a JSON list) if there is one, otherwise start empty.
if os.path.exists("dataset.json"):
    with open("dataset.json", "r", encoding="utf-8") as f:
        data = json.load(f)
else:
    data = []

# Make the cell idempotent: re-running it (or Restart & Run All) previously
# appended the very same chunks to dataset.json again on every execution.
# Only strings are tracked so that any non-string (possibly unhashable)
# entries already in the file are left untouched.
seen = {item for item in data if isinstance(item, str)}
for chunk in small_chunks:
    # Skip chunks that are empty and chunks that are already stored.
    if chunk and chunk not in seen:
        data.append(chunk)
        seen.add(chunk)

# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)