In [None]:
import requests  # Python library to send HTTP requests.

url = "https://www.iisermohali.ac.in/"
#url= "https://timesofindia.indiatimes.com/"

response = requests.get(url)

# Check if request succeeded
if response.status_code == 200:
    html_content = response.text  # This is the raw HTML
    print("Successfully fetched the webpage!")
else:
    print("Failed to fetch the webpage. Status code:", response.status_code)


Successfully fetched the webpage!


In [4]:
from bs4 import BeautifulSoup

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')
# html
# ├── head
# │   └── title → 'IISER Mohali'
# └── body
#     ├── h1 → 'Welcome to IISER Mohali'
#     ├── p  → 'Indian Institute...'
#     └── p  → 'Established by...'
# We are assuming that all text content is within these tags.
# This is a very basic assumption and may not work for all websites.
# Best way to check is to inspect the website and see where the text lies.
# Extract text from common tags
valid_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'] # valid tags only from the html file, h->heading, p->paragraph, li->list
text = " ".join([element.get_text() for element in soup.find_all(valid_tags)]) # join the entire string into one.

print(text[:])  # print characters


 IISER MOHALI Institute Academics Research People Outreach 79th Independence Day Celebration at IISER Mohali International Yoga Day Celebration at IISER Mohali IISER Mohali awarded degrees to 316 Graduating Students IISER Mohali Science Festival - Inaugurated by Hon'ble Education Minister of Punjab Science demonstrations & competitions The Hon'ble Vice-President of India interacts with IISER Mohali Faculty, Students & Staff Visit of Hon'ble Vice-President of India to IISER Mohali Tattva 2025 Science Fest - Celebrating the essence of Scientific Curiosity  76th Republic Day Celebration at IISER Mohali 500+ School students at IISER Mohali for National Space Day Annual Science Festival Lectures by Dr. Anjan Ray & Dr. Srivari Chandrashekhar Students awarded for outstanding performance in Academics & Sports FACULTY ACHIEVEMENTS PROFESSOR ANIL KUMAR TRIPATHI Director, IISER Mohali Elected as Vice President (Science Promotion) of INSA  1800 school students across tricity visited IISER Mohali  

Now after getting the 'text' part , our next step is to clean the text we get after parsing.
This includes getting rid of symbols , punctuations and numbers and converting all the text to lower case, so that we can standardize the text for word frequency analysis . 


In [None]:
import re

def clean(text:str) -> str:

   text = text.lower() #convert text to lower case

   text =re.sub(r'[^a-z\s]', ' ',text)

   text = re.sub(r'\b(?![ai]\b)[a-z]\b',' ',text)

   text = re.sub(r'\s+',' ',text).strip()

   return text

clean(text)

cleaned_text = clean(text)


##### Now we will break the string into single words and count the frequencies.
##### That is, tokenize, count and store.

In [None]:
from collections import Counter

def count_words(cleaned_text: str) -> Counter:
    """
    Takes a cleaned text string (only lowercase letters and spaces),
    returns a Counter mapping word -> frequency.
    """

    words = cleaned_text.split() # splits the cleaned string over white spaces

    word_counter = Counter(words) # gets frequency of words   

    return word_counter

freq = count_words(cleaned_text)
most_used = freq.most_common()

print("Top 10 most used words:")
for word,freq in most_used:
    print(f"{word:>12} : {freq}")


Top 10 most used words:
       iiser : 15
      mohali : 14
          of : 8
     science : 6
    sciences : 6
   committee : 6
    research : 5
         day : 5
    students : 5
   institute : 4
          at : 4
     faculty : 4
        news : 4
   academics : 3
 celebration : 3
          by : 3
         hon : 3
         ble : 3
         the : 3
        vice : 3
   president : 3
      events : 3
      policy : 3
  facilities : 3
  committees : 3
    calendar : 3
      people : 2
    outreach : 2
          th : 2
     awarded : 2
          to : 2
    festival : 2
       india : 2
        with : 2
      school : 2
         for : 2
    lectures : 2
          dr : 2
          in : 2
      sports : 2
achievements : 2
 departments : 2
     website : 2
   copyright : 2
          us : 2
      campus : 2
    computer : 2
      center : 2
      hostel : 2
        nirf : 2
         icc : 2
    internal : 2
  complaints : 2
       rules : 2
  institutes : 2
      system : 2
     careers : 2
     