Somesh's code:

In [None]:
# Complete end-to-end: scrape -> clean -> count -> store -> compare
# Requirements: requests, beautifulsoup4
# Run: pip install requests beautifulsoup4  (if not installed)

import requests
from bs4 import BeautifulSoup
import re
import sqlite3
from collections import Counter
from typing import List, Tuple

# -------------------------
# Scrape + clean functions
# -------------------------
def clean_text(text: str) -> str:
    """
    Normalize raw text into lowercase, space-separated alphabetic tokens.

    After lowercasing, every character that is not a-z or whitespace becomes
    a space, isolated single letters other than 'a' and 'i' are blanked out,
    and runs of whitespace are squeezed to one space with the ends trimmed.
    """
    substitutions = (
        (r'[^a-z\s]', ' '),               # anything but a-z / whitespace -> space
        (r'\b(?![ai]\b)[a-z]\b', ' '),    # lone letters, except 'a' and 'i'
        (r'\s+', ' '),                    # squeeze whitespace runs
    )
    result = text.lower()
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

def scrape_text_from_url(url: str, valid_tags=None, timeout=12) -> str:
    """
    Download a page and return the cleaned text of selected HTML elements.

    url        : address passed to requests.get.
    valid_tags : tag names whose text is collected; when None, paragraphs,
                 headings h1-h6 and list items are used.
    timeout    : forwarded to requests.get.

    Raises RuntimeError when the response status code is not 200.
    The collected text is joined with spaces and passed through clean_text.
    """
    tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li'] if valid_tags is None else valid_tags

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url} (status {response.status_code})")

    document = BeautifulSoup(response.text, 'html.parser')
    fragments = []
    for element in document.find_all(tags):
        # One string per matching element, inner pieces joined by a space.
        fragments.append(element.get_text(separator=' ', strip=True))
    return clean_text(" ".join(fragments))

# -------------------------
# Counting function
# -------------------------
def count_words(cleaned_text: str) -> Counter:
    """Tally how often each whitespace-separated token occurs in cleaned_text."""
    tally = Counter()
    tally.update(cleaned_text.split())
    return tally

# -------------------------
# SQLite storage functions (single table for all sites)
# -------------------------
def store_site_word_counts(
    word_counts: Counter,
    site: str,
    db_path: str = "words.db",
    table: str = "word_frequency",
    accumulate: bool = True
) -> None:
    """
    Store word counts for a given site into SQLite.

    word_counts : mapping of word -> count to persist.
    site        : identifier stored in the `site` column (e.g. the URL).
    db_path     : SQLite database file; created if it does not exist.
    table       : table name. It is interpolated directly into the SQL text,
                  so it must come from trusted code, never from user input.
    accumulate  :
      - True  : add the new counts to whatever is already stored for the site
                (useful for incremental scraping).
      - False : replace the stored counts for that site with this snapshot;
                words from earlier runs that are absent now are removed.

    All writes happen in a single transaction: either the whole batch is
    committed or, if any statement fails, nothing is changed. The connection
    is always closed, even on error.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on exception
        # (it does not close the connection, hence the outer finally).
        with conn:
            # One table holds rows for many sites: (site, word, frequency).
            conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {table} (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    site TEXT NOT NULL,
                    word TEXT NOT NULL,
                    frequency INTEGER NOT NULL,
                    UNIQUE(site, word)
                )
            """)

            if not accumulate:
                # Snapshot mode: drop every row previously stored for the
                # site so stale words do not survive the replacement.
                conn.execute(f"DELETE FROM {table} WHERE site = ?", (site,))

            # In snapshot mode the site has no rows left, so the conflict
            # branch never fires and this behaves as a plain insert.
            upsert_sql = f"""
                INSERT INTO {table} (site, word, frequency)
                VALUES (?, ?, ?)
                ON CONFLICT(site, word) DO UPDATE
                  SET frequency = {table}.frequency + excluded.frequency
            """
            conn.executemany(
                upsert_sql,
                [(site, w, int(freq)) for w, freq in word_counts.items()],
            )
    finally:
        conn.close()

def read_top_by_site(
    site: str,
    db_path: str = "words.db",
    table: str = "word_frequency",
    top: int = 20
) -> List[Tuple[str, int]]:
    """
    Return up to `top` (word, frequency) rows stored for `site`.

    Rows are ordered by frequency, highest first; words with equal frequency
    are ordered alphabetically so the result is deterministic.

    `table` is interpolated into the SQL text and must be a trusted name.
    The connection is closed even if the query raises (for example when the
    table does not exist yet).
    """
    conn = sqlite3.connect(db_path)
    try:
        query = (
            f"SELECT word, frequency FROM {table} WHERE site = ? "
            "ORDER BY frequency DESC, word ASC LIMIT ?"
        )
        return conn.execute(query, (site, top)).fetchall()
    finally:
        conn.close()

def compare_sites_common(
    site1: str,
    site2: str,
    db_path: str = "words.db",
    table: str = "word_frequency",
    top: int = 30,
    sort_by: str = "sum"  # "sum" | "min" | "diff"
) -> List[Tuple[str, int, int]]:
    """
    Return words stored for both site1 and site2 as (word, freq_site1, freq_site2).

    sort_by:
      - 'sum' : sort by freq1+freq2 descending (default)
      - 'min' : sort by min(freq1,freq2) descending -> words both strongly used
      - 'diff': sort by absolute difference descending -> most differently-used words
      - any other value: rows are returned in the order the query produced them

    For the three named modes, rows with an equal sort value are ordered
    alphabetically by word so the output is deterministic. At most `top`
    rows are returned.

    `table` is interpolated into the SQL text and must be a trusted name.
    The connection is closed even if the query raises.
    """
    sql = f"""
        SELECT a.word, a.frequency AS f1, b.frequency AS f2
        FROM {table} a
        JOIN {table} b ON a.word = b.word
        WHERE a.site = ? AND b.site = ?
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(sql, (site1, site2)).fetchall()
    finally:
        conn.close()

    # Each metric maps a (word, f1, f2) row to the value ranked descending.
    metrics = {
        "sum": lambda r: r[1] + r[2],
        "min": lambda r: min(r[1], r[2]),
        "diff": lambda r: abs(r[1] - r[2]),
    }
    metric = metrics.get(sort_by)
    if metric is not None:
        # Negate the metric for descending order; the word breaks ties.
        rows = sorted(rows, key=lambda r: (-metric(r), r[0]))

    return rows[:top]

def unique_to_site(
    site1: str,
    site2: str,
    db_path: str = "words.db",
    table: str = "word_frequency",
    top: int = 50
) -> List[Tuple[str, int]]:
    """
    Return words present in site1 but NOT in site2 as (word, frequency_in_site1).

    Rows are ordered by site1 frequency, highest first; words with equal
    frequency are ordered alphabetically so the result is deterministic.
    At most `top` rows are returned.

    `table` is interpolated into the SQL text and must be a trusted name.
    The connection is closed even if the query raises.
    """
    # Anti-join: keep the site1 rows for which no site2 row with the same
    # word exists (the LEFT JOIN leaves b.word NULL in that case).
    sql = f"""
        SELECT a.word, a.frequency
        FROM {table} a
        LEFT JOIN {table} b ON a.word = b.word AND b.site = ?
        WHERE a.site = ? AND b.word IS NULL
        ORDER BY a.frequency DESC, a.word ASC
        LIMIT ?
    """
    conn = sqlite3.connect(db_path)
    try:
        # Parameter order follows the placeholders: site2 appears first.
        return conn.execute(sql, (site2, site1, top)).fetchall()
    finally:
        conn.close()

# -------------------------
# Example end-to-end usage
# -------------------------
if __name__ == "__main__":
    # --- Site 1: IISER Mohali -------------------------------------------
    mohali_url = "https://www.iisermohali.ac.in/"
    print("Scraping:", mohali_url)
    cleaned_mohali_text = scrape_text_from_url(mohali_url)
    word_counts_mohali = count_words(cleaned_mohali_text)
    print("Unique words (Mohali):", len(word_counts_mohali))
    print("Top 10 (Mohali):", word_counts_mohali.most_common(10))

    # accumulate=True adds to counts already stored for the site, if any.
    store_site_word_counts(word_counts_mohali, site=mohali_url, accumulate=True)
    print("Stored Mohali counts in DB.")

    # --- Site 2: IISER Pune ---------------------------------------------
    pune_url = "https://www.iiserpune.ac.in/"
    print("\nScraping:", pune_url)
    cleaned_pune_text = scrape_text_from_url(pune_url)
    word_counts_pune = count_words(cleaned_pune_text)
    store_site_word_counts(word_counts_pune, site=pune_url, accumulate=True)
    print("Stored Pune counts in DB.")

    # --- Per-site top words, read back from the database -----------------
    site_headings = (
        ("\nTop words - Mohali:", mohali_url),
        ("\nTop words - Pune:", pune_url),
    )
    for heading, site_url in site_headings:
        print(heading)
        for w, f in read_top_by_site(site_url, top=20):
            print(f"{w:>15}  {f}")

    # --- Words both sites share, ranked by combined frequency ------------
    print("\nTop common words (by sum of frequencies):")
    shared_rows = compare_sites_common(mohali_url, pune_url, top=25, sort_by="sum")
    for word, f1, f2 in shared_rows:
        print(f"{word:>15}  Mohali:{f1:5d}  Pune:{f2:5d}")

    # --- Words stored for Mohali that Pune does not have -----------------
    print("\nTop words unique to Mohali (not in Pune):")
    mohali_only_rows = unique_to_site(mohali_url, pune_url, top=30)
    for w, f in mohali_only_rows:
        print(f"{w:>15}  {f}")


Mine (with ChatGPT's help, of course):

In [None]:
"""
Full Program: Web Scraper + Text Cleaner + YAKE Keyword Extraction + SQLite Storage
-----------------------------------------------------------------------------------
This program:
1. Scrapes text from a given website (using requests + BeautifulSoup).
2. Cleans the text (removes noise, normalizes case and spacing).
3. Extracts important keywords/phrases using YAKE.
4. Stores results into a SQLite database.
5. Reads back the top-ranked keywords per site, so sites can be compared by keyword importance.
"""

import requests
from bs4 import BeautifulSoup
import re
import sqlite3
import yake
from typing import List, Tuple

# =========================================================
# STEP 1: Text cleaning
# =========================================================
def clean_text(text: str) -> str:
    """
    Reduce text to lowercase letters separated by single spaces.

    Anything that is not a-z or whitespace (after lowercasing) is replaced
    by a space; whitespace runs are then collapsed and the ends trimmed.
    """
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.strip()


# =========================================================
# STEP 2: Scraping text from a webpage
# =========================================================
def scrape_text_from_url(url: str, valid_tags=None, timeout: int = 10) -> str:
    """
    Download a page, pull the text out of selected tags, and clean it.

    valid_tags defaults to paragraphs, headings h1-h4 and list items.
    timeout is forwarded to requests.get.
    Raises RuntimeError when the response status code is not 200.
    """
    tags = ['p', 'h1', 'h2', 'h3', 'h4', 'li'] if valid_tags is None else valid_tags

    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to fetch {url} (status {response.status_code})")

    parsed_page = BeautifulSoup(response.text, 'html.parser')

    # Gather the text of each matching element, then join with spaces.
    fragments = []
    for node in parsed_page.find_all(tags):
        fragments.append(node.get_text(separator=' ', strip=True))

    return clean_text(" ".join(fragments))


# =========================================================
# STEP 3: YAKE Keyword Extraction
# =========================================================
def extract_keywords_yake(text: str, max_keywords: int = 20) -> List[Tuple[str, float]]:
    """
    Run YAKE over `text` and return its (keyword, score) pairs.

    At most `max_keywords` results are requested from the extractor.
    A lower score means a more relevant keyword.
    """
    yake_settings = {
        "lan": "en",            # language
        "n": 1,                 # max n-gram size (1 = single words)
        "dedupLim": 0.9,        # threshold for merging similar words
        "top": max_keywords,    # number of keywords to extract
        "features": None,       # use default YAKE features
    }
    extractor = yake.KeywordExtractor(**yake_settings)
    return extractor.extract_keywords(text)


# =========================================================
# STEP 4: SQLite Storage
# =========================================================
def store_keywords(
    keywords: List[Tuple[str, float]],
    site: str,
    db_path: str = "keywords.db",
    table: str = "keyword_scores"
) -> None:
    """
    Store (keyword, score) pairs for a site into a SQLite database.

    keywords : pairs to persist; a keyword already stored for the site has
               its score overwritten, other stored keywords are left as-is.
    site     : identifier stored in the `site` column (e.g. the URL).
    db_path  : SQLite database file; created if it does not exist.
    table    : table name. It is interpolated directly into the SQL text,
               so it must come from trusted code, never from user input.

    The whole batch is written in one transaction: if any row fails, none of
    the batch is kept. The connection is always closed, even on error.
    """
    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back on exception
        # (it does not close the connection, hence the outer finally).
        with conn:
            conn.execute(f"""
                CREATE TABLE IF NOT EXISTS {table} (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    site TEXT NOT NULL,
                    keyword TEXT NOT NULL,
                    score REAL NOT NULL,
                    UNIQUE(site, keyword)
                )
            """)

            # Insert new keywords; on a (site, keyword) clash keep the new score.
            conn.executemany(
                f"""
                INSERT INTO {table} (site, keyword, score)
                VALUES (?, ?, ?)
                ON CONFLICT(site, keyword)
                DO UPDATE SET score = excluded.score
                """,
                [(site, kw, score) for kw, score in keywords],
            )
    finally:
        conn.close()


# =========================================================
# STEP 5: Read keywords from DB
# =========================================================
def read_top_keywords(
    site: str,
    db_path: str = "keywords.db",
    table: str = "keyword_scores",
    top: int = 10
) -> List[Tuple[str, float]]:
    """
    Fetch up to `top` (keyword, score) rows stored for `site`.

    Rows are ordered by score ascending (lowest YAKE score = most important);
    keywords with an equal score are ordered alphabetically so the result is
    deterministic.

    `table` is interpolated into the SQL text and must be a trusted name.
    The connection is closed even if the query raises (for example when the
    table does not exist yet).
    """
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(f"""
            SELECT keyword, score FROM {table}
            WHERE site = ?
            ORDER BY score ASC, keyword ASC
            LIMIT ?
        """, (site, top)).fetchall()
    finally:
        conn.close()


# =========================================================
# STEP 6: Example run (compare two IISER sites)
# =========================================================
if __name__ == "__main__":
    # Home pages of the two institutes being processed.
    mohali_url = "https://www.iisermohali.ac.in/"
    pune_url   = "https://www.iiserpune.ac.in/"

    for site in (mohali_url, pune_url):
        print(f"\n--- Processing {site} ---")

        # Fetch the page and reduce it to cleaned text.
        text = scrape_text_from_url(site)
        word_total = len(text.split())
        print(f"Scraped {word_total} words from {site}")

        # Rank keywords with YAKE and persist them for this site.
        keywords = extract_keywords_yake(text, max_keywords=20)
        store_keywords(keywords, site)

        # Read the best-scoring keywords back from the database and show them.
        print("Top 10 Keywords:")
        stored_rows = read_top_keywords(site, top=10)
        for kw, score in stored_rows:
            print(f"  {kw:30}  score={score:.6f}")

    print("\n✅ YAKE keyword extraction completed and stored in database.")
