In [21]:
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# gets links from a webpage
def get_site_links(base_url, common_part_of_url):
    links = []
    res = requests.get(base_url, headers ={
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                    "AppleWebKit/537.36 (KHTML, like Gecko) " +
                    "Chrome/114.0.0.0 Safari/537.36"
    })
    soup = BeautifulSoup(res.content, "html.parser")

    for a in soup.find_all("a", href=True):
        href = a["href"]
        if(href.startswith(common_part_of_url)):
            links.append(href)

    time.sleep(1)
    return list(set(links))

In [None]:
#extracts necessary info from webpage
def scrape_vox_article(url):
    res = requests.get(url, headers ={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
                  "AppleWebKit/537.36 (KHTML, like Gecko) " +
                  "Chrome/114.0.0.0 Safari/537.36"
    })
    soup = BeautifulSoup(res.content, "html.parser")

    # Try to find the content block
    article_divs = soup.find_all("div", class_ = "elementor-widget-container") #specific for vox iitk

    paragraphs = []
    for div in article_divs:
        for p in div.find_all("p"):
            text = p.get_text(strip=True)
            if text:  # skip empty
                paragraphs.append(text)

    content = "\n".join(paragraphs)
    return content

In [None]:
# extracting chunks of data
base_links = ['https://voxiitk.com/category/all-about-iitk/', 'https://voxiitk.com/category/all-about-iitk/page/2', 'https://voxiitk.com/category/all-about-iitk/page/3', 'https://voxiitk.com/category/all-about-iitk/page/4', 'https://voxiitk.com/category/all-about-iitk/page/5', 'https://voxiitk.com/category/editorials/', 'https://voxiitk.com/category/flagship-series/page/', 'https://voxiitk.com/category/flagship-series/page/2', 'https://voxiitk.com/category/flagship-series/page/3', 'https://voxiitk.com/category/flagship-series/page/4', 'https://voxiitk.com/category/flagship-series/page/5', 'https://voxiitk.com/category/flagship-series/page/6', 'https://voxiitk.com/category/reports-and-investigations/page/', 'https://voxiitk.com/category/reports-and-investigations/page/2', 'https://voxiitk.com/category/surveys/', 'https://voxiitk.com/category/beyond-iitk/', 'https://voxiitk.com/category/flagship-series/iitk-101/']

big_chunks = []
for base_link in base_links:
    links = get_site_links(base_link, 'https://voxiitk.com')
    for url in links:
        big_chunks.append(scrape_vox_article(url))

In [72]:
print(len(big_chunks))

857


In [None]:
# chunkifying function which shortens data chunks to processable ones 
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize

def chunk_text(text, max_words=400, overlap=100):
    sentences = sent_tokenize(text)
    
    chunks = []
    current_chunk = []
    current_word_count = 0

    i = 0
    while i < len(sentences):
        sentence = sentences[i]
        word_count = len(word_tokenize(sentence))

        if current_word_count + word_count <= max_words:
            current_chunk.append(sentence)
            current_word_count += word_count
            i += 1
        else:
            # Save chunk
            chunk_text = ' '.join(current_chunk).strip()
            chunks.append(chunk_text)

            # Move back overlap words
            if overlap > 0:
                backtrack_words = 0
                backtrack_index = len(current_chunk) - 1
                while backtrack_index >= 0 and backtrack_words < overlap:
                    backtrack_words += len(word_tokenize(current_chunk[backtrack_index]))
                    backtrack_index -= 1
                current_chunk = current_chunk[backtrack_index+1:]
                current_word_count = sum(len(word_tokenize(s)) for s in current_chunk)
            else:
                current_chunk = []
                current_word_count = 0

    # Save any remaining chunk
    if current_chunk:
        chunks.append(' '.join(current_chunk).strip())

    return chunks


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shilp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\shilp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# chunking data
from tqdm import tqdm

small_chunks = []
for chonkies in tqdm(big_chunks, desc="Chunkifying"):
    for small_chonkies in chunk_text(chonkies):
        small_chunks.append(small_chonkies)

print(small_chunks[:5])

Chunkifying: 100%|██████████| 857/857 [00:10<00:00, 84.50it/s] 

['Disclaimer: Vox Populi, IIT Kanpur, is the exclusive owner of the information on this website. No part of this content may be duplicated, paraphrased, or interpreted in any other way without written consent from Vox Populi. If you want to reproduce any of the content on this page, please contact our chief editors directly or reach out to us by email atvoxpopuli@iitk.ac.in. On February 19, 2025, the Council of Students for Hostel Affairs (CoSHA) held its second emergency meeting in the Senate Hall to discuss the recent eviction notice served to the Dhobis, whose deadline to vacate was set for March 6. This meeting was attended by Hall Presidents, Gymkhana representatives and a number of GBMs. According to the published minutes, “The central theme of the discussion was that the students, being the majority stakeholders of the service, were never consulted before the eviction notices were served to the Dhobis.”\nThe institute’s long-term modernization plan was a topic of discussion. The




In [74]:
print(len(small_chunks))

3113


In [None]:
#cleaning function
import unicodedata
import re

def clean_chunk(raw_html):
    # removing html code
    soup = BeautifulSoup(raw_html, "html.parser")
    text = soup.get_text(separator=" ") 

    # normalising unicode
    text = unicodedata.normalize("NFKC", text)

    # removing urls and emails
    text = re.sub(r"http\S+|www\.\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)

    # fix bad spacing
    text = re.sub(r"\s+", " ", text)

    # fix punctuation
    text = re.sub(r"\.([A-Z])", r". \1", text)

    return text.strip()

In [None]:
#cleaning data
for i in range(len(small_chunks)):
    small_chunks[i] = clean_chunk(small_chunks[i])

In [None]:
# adding data to dataset
import json 
import os
if os.path.exists("dataset.json"):
    with open("dataset.json", "r", encoding="utf-8") as f:
        data = json.load(f)
else:
    data = []

data.extend(small_chunks)

with open("dataset.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)