## KEYWORD IDENTIFICATION

In [None]:
import requests  # Python library to send HTTP requests.
from bs4 import BeautifulSoup
import re
from collections import Counter
import sqlite3
from typing import Dict, Iterable, Tuple

### We will first do the scraping, cleaning, and counting for the site.

In [48]:
def scrape_clean_count(url: str, *, return_counts: bool = False):
    """Fetch a web page, reduce its text to plain words, and count them.

    The page is downloaded with ``requests``, the text inside paragraph,
    heading and list-item tags is extracted with BeautifulSoup, everything
    except the letters a-z is stripped, and the remaining words are tallied.

    Args:
        url: Address of the page to scrape.
        return_counts: When True, return the word-frequency ``Counter``.
            The default (False) keeps the historical behaviour of returning
            ``None``, which avoids dumping a large Counter into notebook
            output.

    Returns:
        ``None`` by default, and always ``None`` when the server answers
        with a status other than 200. Otherwise, if ``return_counts`` is
        True, a ``collections.Counter`` mapping each word to the number of
        times it occurred.

    Raises:
        Any exception raised by ``requests.get`` (connection failure,
        timeout, invalid URL, ...) propagates to the caller unchanged.
    """
    # ------------------------------ GETTING URL ------------------------------ #

    # A timeout keeps an unresponsive server from blocking the call forever.
    response = requests.get(url, timeout=30)

    # Anything other than 200 means there is no page to parse: report it and
    # stop here instead of continuing with an undefined HTML string.
    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        return None

    print("Successfully fetched the webpage!")
    html_content = response.text  # This is the raw HTML.
    # print(html_content)  # uncomment to inspect the raw HTML.

    # ------------------- GETTING TEXT INSIDE VALID HTML TAGS ------------------ #

    # Parse the HTML into a tree, e.g.
    # html
    # ├── head
    # │   └── title → 'IISER Mohali'
    # └── body
    #     ├── h1 → 'Welcome to IISER Mohali'
    #     ├── p  → 'Indian Institute...'
    #     └── p  → 'Established by...'
    soup = BeautifulSoup(html_content, 'html.parser')

    # We assume that all meaningful text lives inside these tags
    # (h -> heading, p -> paragraph, li -> list item). This is a very basic
    # assumption and may not hold for every website; inspect the page to see
    # where its text actually lies.
    valid_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']
    text = " ".join(element.get_text() for element in soup.find_all(valid_tags))
    # print(text)  # uncomment to inspect the extracted text.

    # ---------------- CLEANING THE TEXT TO GET LOGICAL WORDS ----------------- #

    # The extracted text still contains digits, punctuation and other symbols
    # that are not words, so normalise it step by step.
    clean_text = text.lower()
    # Replace everything that is not a lowercase letter or whitespace.
    clean_text = re.sub(r'[^a-z\s]', ' ', clean_text)
    # Drop stray single letters, keeping the real one-letter words "a" and "i".
    clean_text = re.sub(r'\b(?![ai]\b)[a-z]\b', ' ', clean_text)
    # Collapse runs of whitespace into single spaces.
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()
    # print(clean_text)  # uncomment to inspect the cleaned text.

    # ------------------- COUNTING AND GETTING THE FREQUENCY ------------------ #

    # Split on whitespace and tally how many times each word appears.
    word_counter = Counter(clean_text.split())
    # To inspect the result sorted by frequency, uncomment:
    # for word, freq in word_counter.most_common():
    #     print(f"{word:>12} : {freq}")

    if return_counts:
        return word_counter
    return None


# To see what the function's output looks like, uncomment the call below:
#scrape_clean_count("https://www.iisermohali.ac.in/")

### With word counting done, next we store data in SQLite3.