## KEYWORD IDENTIFICATION

In [7]:
import requests  # Python library to send HTTP requests.
from bs4 import BeautifulSoup
import re
from collections import Counter
import sqlite3
from typing import Dict, Iterable, Tuple

### We will first do scraping, cleaning and counting for the site.

In [8]:
def scrape_clean_count(url: str, timeout: float = 30.0) -> Counter:
    """
    Fetch a web page and count how often each word occurs in its text.

    Steps: download the page, pull the text out of heading/paragraph/list
    tags, normalise it to lower-case alphabetic words, then count the words.

    Parameters
    ----------
    url : str
        Address of the page to scrape.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout `requests.get` may block forever.

    Returns
    -------
    collections.Counter
        Mapping of word -> number of occurrences.

    Raises
    ------
    requests.HTTPError
        If the server answers with a status code other than 200.
    requests.RequestException
        For connection problems or timeouts (raised by `requests.get`).
    """

    #-------------------------------------------------------------------------------------------------------------------------------#
    #-------------------------------------------------------GETTING URL-------------------------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------------------------#

    response = requests.get(url, timeout=timeout)
    # Anything other than 200 means we did not get the page. Stop here with a
    # clear error instead of continuing without any HTML to parse.
    if response.status_code != 200:
        print("Failed to fetch the webpage. Status code:", response.status_code)
        raise requests.HTTPError(
            f"Failed to fetch {url!r}: status code {response.status_code}",
            response=response,
        )
    print("Successfully fetched the webpage!")
    html_content = response.text  # This is the raw HTML.

    #-------------------------------------------------------------------------------------------------------------------------------#
    #-------------------------------------------------GETTING, THINGS INSIDE VALID HTML TAGS----------------------------------------#
    #-------------------------------------------------------------------------------------------------------------------------------#

    # Parse the HTML and keep only the text found inside the tags below
    # (h -> heading, p -> paragraph, li -> list item).
    # This is a basic assumption and may not fit every website; inspect the
    # page to see where its text actually lives.
    # NOTE: when a matching tag is nested inside another matching tag
    # (e.g. a <p> inside an <li>), its text is collected once per tag.
    soup = BeautifulSoup(html_content, 'html.parser')
    valid_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']
    text = " ".join(element.get_text() for element in soup.find_all(valid_tags))

    #-------------------------------------------------------------------------------------------------------------------------------#
    #-------------------------------------CLEANING THE TEXT TO GET LOGICAL WORDS (TO SOME EXTENT)-----------------------------------#
    #-------------------------------------------------------------------------------------------------------------------------------#

    # The raw text still contains digits, symbols and punctuation; we only
    # want plain words.
    clean_text = text.lower()
    # Replace everything except a-z and whitespace with a space.
    clean_text = re.sub(r'[^a-z\s]', ' ', clean_text)
    # Drop stray single letters, keeping the real words "a" and "i".
    clean_text = re.sub(r'\b(?![ai]\b)[a-z]\b', ' ', clean_text)
    # Collapse runs of whitespace into single spaces.
    clean_text = re.sub(r'\s+', ' ', clean_text).strip()

    #-------------------------------------------------------------------------------------------------------------------------------#
    #-------------------------------------------COUNTING AND GETTING THE FREQUENCY--------------------------------------------------#
    #-------------------------------------------------------------------------------------------------------------------------------#

    # Split on whitespace and count; use word_counter.most_common(n) to
    # inspect the n most frequent words.
    word_counter = Counter(clean_text.split())

    return word_counter  # for further analysis


# to see how the function's output looks like
#scrape_clean_count("https://www.iisermohali.ac.in/")

### With word counting done, next we store data in SQLite3.

### Now we define the database functions and tables properly.
#### Next, we analyse and visualize the data.

In [None]:
# SQLite is a self-contained database that stores data in a single file (like data.db).
# You don’t need to run a separate database server, it is built into Python and perfect for “store plus query later” tasks.
# In your case, you want to store:
# each URL,
# each word from that URL, and
# the frequency of that word.

# Database	A single file that holds tables (e.g., keywords.db)
# Table	    A sheet-like structure (e.g., one table for word counts)
# Row	    A single record (e.g., one word’s count for one URL)
# Column	A field in each row (e.g., URL, word, frequency)
# Cursor	A helper that executes SQL commands and returns results
# SQL	    The language we use to create, insert, and read data


#-------------------------------------------------------------------------------------------------------------------------------#
#-------------------------------------------Create or Connect to DataBase-------------------------------------------------------#  
#-------------------------------------------------------------------------------------------------------------------------------# 
def init_db(db_path: str = "keywords.db") -> sqlite3.Connection:
    """
    Open (or create) the SQLite database at `db_path` and make sure the
    `word_counts` table and its indexes exist.

    Parameters
    ----------
    db_path : str
        Path of the database file; ":memory:" gives a temporary in-memory
        database. Defaults to "keywords.db".

    Returns
    -------
    sqlite3.Connection
        An open connection. The caller is responsible for closing it.
    """
    # Connect to the requested database file (it is created if missing).
    conn = sqlite3.connect(db_path)

    # Tell the database what to create.
    # CREATE TABLE  → make a new table.
    # IF NOT EXISTS → don't complain if it's already there.
    # The parentheses define columns:
    # id       : unique number auto-generated for each row.
    # url      : text (string).
    # word     : text.
    # frequency: integer.
    # UNIQUE   : a given (url, word) pair can appear only once.
    conn.execute("""
    CREATE TABLE IF NOT EXISTS word_counts (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT NOT NULL,
        word TEXT NOT NULL,
        frequency INTEGER,
        UNIQUE(url, word) 
    )
    """)

    # Index on url: speeds up queries that filter by the url column.
    conn.execute("CREATE INDEX IF NOT EXISTS idx_url ON word_counts(url)")
    # Index on word: speeds up queries like "find where word = 'research'".
    conn.execute("CREATE INDEX IF NOT EXISTS idx_word ON word_counts(word)")

    # Save the changes made above.
    conn.commit()
    return conn



#-------------------------------------------------------------------------------------------------------------------------------#
#-------------------------------------------Write and Save in the Database------------------------------------------------------#  
#-------------------------------------------------------------------------------------------------------------------------------# 
def save_counter_overwrite(conn: sqlite3.Connection, url: str, word_counter: Counter) -> None:
    """
    Save all (word, freq) pairs for `url`. If a row for (url, word) already
    exists, its frequency is overwritten with the new value.

    The whole batch is written with executemany inside a single transaction:
    it is committed on success and rolled back if any row fails, so a failed
    call never leaves part of the batch behind.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to a database containing the `word_counts` table.
    url : str
        URL the counts belong to.
    word_counter : Counter
        Mapping of word -> frequency to store.

    Raises
    ------
    sqlite3.Error
        If the database rejects a row (the transaction is rolled back first).
    """
    # first  ? → url,
    # second ? → word,
    # third  ? → freq.
    # Insert a new row; if the (url, word) pair already exists, keep the row
    # and replace its frequency with the incoming one.
    sql = """
    INSERT INTO word_counts (url, word, frequency)
    VALUES (?, ?, ?)
    ON CONFLICT(url, word) DO UPDATE
      SET frequency = excluded.frequency
    """
    # Build every row up front so executemany can send them in one go,
    # which is much faster than calling execute() once per word.
    data = [(url, word, int(freq)) for word, freq in word_counter.items()]
    # `with conn` commits when the block succeeds and rolls back on error.
    with conn:
        conn.executemany(sql, data)



#-------------------------------------------------------------------------------------------------------------------------------#
#-----------------------------------Get words for url in "Word : Frequency" format----------------------------------------------#  
#-------------------------------------------------------------------------------------------------------------------------------# 
def get_words_for_url(conn: sqlite3.Connection, url: str, limit: int = 1000):
    """
    Fetch the words stored for one URL, most frequent first.

    Returns a list of (word, frequency) tuples containing at most `limit`
    rows; the list is empty when nothing is stored for `url`.
    """
    query = "SELECT word, frequency FROM word_counts WHERE url = ? ORDER BY frequency DESC LIMIT ?"
    params = (url, limit)
    rows = conn.execute(query, params).fetchall()
    return rows



#-------------------------------------------------------------------------------------------------------------------------------#
#------------------------------Get top words across all urls in "Word : Frequency" format---------------------------------------#  
#-------------------------------------------------------------------------------------------------------------------------------#
def get_top_words_overall(conn: sqlite3.Connection, limit: int = 1000):
    """
    Rank words by their total frequency summed over every stored URL.

    Returns a list of (word, total_frequency) tuples, highest total first,
    containing at most `limit` rows.
    """
    query = """
    SELECT word, SUM(frequency) as total_freq
    FROM word_counts
    GROUP BY word
    ORDER BY total_freq DESC
    LIMIT ?
    """
    # Iterating the cursor yields the same row tuples fetchall() would return.
    return list(conn.execute(query, (limit,)))
  

### Using the functions to create, read, write, and analyse the database.

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------#
#------------------------------------------------------------USAGE--------------------------------------------------------------#  
#-------------------------------------------------------------------------------------------------------------------------------#
if __name__ == "__main__":
    # Quick demo: scrape two sites, store their word counts, then compare them.
    url1 = "https://www.iisermohali.ac.in/"
    url2 = "https://www.iiserpune.ac.in/"

    # One connection is enough for the whole demo; try/finally guarantees it
    # is closed even if scraping or a query fails.
    conn = init_db("example_keywords.db")
    try:
        for url in (url1, url2):
            example_counter = scrape_clean_count(url)
            save_counter_overwrite(conn, url, example_counter)

        # Query each site once and reuse the rows below.
        words1 = get_words_for_url(conn, url1)
        words2 = get_words_for_url(conn, url2)

        print("Top words for IISER Mohali:")
        for word, freq in words1:
            print(word, freq)

        print("Top words for IISER Pune:")
        for word, freq in words2:
            print(word, freq)

        print("\nTop words overall:")
        for word, total in get_top_words_overall(conn, limit=10):
            print(word, total)

        # Words present on both sites, listed in url1's frequency order.
        # A dict lookup replaces re-querying url2 for every word of url1.
        print("\nTop common words:")
        freq_in_url2 = dict(words2)
        for word, freq1 in words1:
            if word in freq_in_url2:
                print(word, freq1, freq_in_url2[word])

        # TODO: also report the "uncommon" words (present on only one site).
    finally:
        conn.close()
