In [1]:
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from sklearn.decomposition import LatentDirichletAllocation

In [7]:
import pandas as pd
import re

In [8]:
files = ["The Indian Express/TheIndianExpress.html", "The Economic Times/TheEconomicTimes.html", "TIE/TIE.html"]
def extract(fp):
    with open(fp,'r',encoding='utf-8') as f:
        soup=BeautifulSoup(f.read(),'html.parser')
        for tag in soup(['script','style']):
            tag.decompose()
    txt=" ".join(p.get_text() for p in soup.find_all('p'))
    txt=re.sub(r"https?://\S+|www\.\S+"," ",txt)
    txt=re.sub(r"//\s+"," ",txt).strip()
    return txt

In [13]:
texts=[ extract(p) for p in files if len(extract(p))>=50]
vectorizer=CountVectorizer(stop_words='english',max_df=0.95,min_df=2)
x=vectorizer.fit_transform(texts)
terms=vectorizer.get_feature_names_out()


In [16]:
score=x.sum(axis=0).A1
top_idx=score.argsort()[::-1][:15]
for i,topic in enumerate(top_idx):
    print(f" {terms[topic]:.20s} {score[topic]:.1f}")

 express 30.0
 window 28.0
 opens 28.0
 august 21.0
 arrivals 17.0
 score 13.0
 research 12.0
 cricket 11.0
 visa 11.0
 shorts 10.0
 continues 10.0
 screen 10.0
 wi 10.0
 advertisement 10.0
 stem 9.0


In [19]:
k=min(6,len(texts))
lda=LatentDirichletAllocation(n_components=k,random_state=42)
lda.fit(x)
for i,topic in enumerate(lda.components_):
    ll=topic.argsort()[::-1][:10]
    print(f" topic:{i+1} :{"| ".join(terms[ll])}")

 topic:1 :companies| united| visa| costs| states| prestige| emerging| remains| cost| tuition
 topic:2 :driven| director| fully| finance| ambition| living| ivy| investment| labs| contrast
 topic:3 :express| window| opens| august| arrivals| score| cricket| advertisement| screen| shorts


In [20]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from collections import Counter
import re
import math
import html
import os

FILES = [r"C:\BIDA LAB\EXP 7\r_news.htm"]

LEX = {
    "good": 2, "great": 3, "excellent": 4, "amazing": 4, "love": 3, "like": 2,
    "nice": 2, "awesome": 4, "helpful": 2, "bad": -2, "terrible": -3, "awful": -3,
    "hate": -3, "slow": -2, "buggy": -3, "confusing": -2, "broken": -3, "issue": -2,
    "problem": -2, "worst": -4, "disappointed": -3, "frustrating": -3
}

NEG = {"not", "no", "never", "none", "hardly", "barely", "scarcely"}

WORD_RE = re.compile(r"[A-Za-z][A-Za-z\-']+")
URL_RE = re.compile(r"https?://\S+")
EMOJI_RE = re.compile(r"[\U00010000-\U0010ffff]")

def clean(text):
    text = html.unescape(text or "")
    text = URL_RE.sub(" ", text)
    text = EMOJI_RE.sub("", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    return [w.lower() for w in WORD_RE.findall(text)]

def score(text):
    words = tokenize(text)
    total = 0.0
    for i, w in enumerate(words):
        val = LEX.get(w, 0)
        if val:
            if any(words[i-j] in NEG for j in range(1, min(3, i) + 1)):
                val *= -1
            total += val
    return total / max(1.0, math.log(len(words) + 1, 3))

def grab_comments(html_doc):
    soup = BeautifulSoup(html_doc, "html.parser")
    for t in soup(["script", "style", "noscript", "iframe", "svg"]):
        t.decompose()
    cands = soup.find_all(attrs={"class": re.compile("comment|reply", re.I)})
    cands += soup.find_all(id=re.compile("comment|reply", re.I))
    texts = []
    for el in set(cands):
        txt = clean(el.get_text(" ", strip=True))
        if len(txt) > 12:
            texts.append(txt)
    return list(dict.fromkeys(texts))

comments = []
for path in FILES:
    if os.path.exists(path):
        with open(path, encoding="utf-8", errors="ignore") as f:
            comments += grab_comments(f.read())

comments = list(dict.fromkeys(comments))
if not comments:
    raise SystemExit("No comments found. Save a page with comments and set FILES.")

scores = [score(c) for c in comments]
labels = []
for s in scores:
    if s >= 0.05:
        labels.append("positive")
    elif s <= -0.05:
        labels.append("negative")
    else:
        labels.append("neutral")

cnt = Counter(labels)
overall = max(cnt, key=cnt.get)
print("Overall tone:", overall)
print("Counts:", dict(cnt))

order = ["positive", "neutral", "negative"]
vals = [cnt.get(k, 0) for k in order]
plt.bar(order, vals)
plt.title("Sentiment Distribution")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


SystemExit: No comments found. Save a page with comments and set FILES.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [23]:
# EXP7 - HTML Sentiment Analysis


from bs4 import BeautifulSoup
import re
from collections import Counter

# Sentiment lexicon
positive_words = {'good', 'great', 'excellent', 'amazing', 'love', 'like', 'nice', 'awesome', 'helpful', 'fantastic', 'wonderful', 'brilliant', 'outstanding', 'perfect', 'best', 'better', 'super', 'cool'}
negative_words = {'bad', 'terrible', 'awful', 'hate', 'slow', 'buggy', 'confusing', 'broken', 'issue', 'problem', 'worst', 'disappointed', 'frustrating', 'annoying', 'horrible', 'disgusting', 'pathetic', 'useless', 'worthless', 'stupid', 'dumb', 'idiotic', 'ridiculous', 'absurd', 'nonsense', 'garbage', 'trash'}
negation_words = {'not', 'no', 'never', 'none', 'hardly', 'barely', 'scarcely'}

def extract_comments(html_file):
    with open(html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    
    # Remove scripts and styles
    for tag in soup(["script", "style", "noscript", "iframe", "svg"]):
        tag.decompose()
    
    # Find comment elements
    comments = []
    for elem in soup.find_all(['div', 'p', 'span', 'article']):
        if any(word in elem.get('class', []) for word in ['comment', 'reply', 'review']):
            text = elem.get_text().strip()
            if len(text) > 12:
                comments.append(text)
    
    return comments

def analyze_sentiment(text):
    words = re.findall(r'\b\w+\b', text.lower())
    score = 0
    for i, word in enumerate(words):
        if word in positive_words:
            score += 1
        elif word in negative_words:
            score -= 1
        
        # Check for negation
        if i > 0 and words[i-1] in negation_words:
            score *= -1
    
    return "positive" if score > 0 else ("negative" if score < 0 else "neutral")

# Process file
comments = extract_comments("r_news/r_news.html")
if not comments:
    print("No comments found")
else:
    sentiments = [analyze_sentiment(comment) for comment in comments]
    counts = Counter(sentiments)
    
    print(f"Overall tone: {max(counts, key=counts.get)}")
    print(f"Counts: {dict(counts)}")
    
    # Show examples
    print("\nSample comments:")
    for i, comment in enumerate(comments[:5]):
        print(f"{i+1}. [{sentiments[i]}] {comment[:100]}...")


No comments found


In [26]:
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from collections import Counter
import re

# ----------- Input File -----------
file_path = r"r_news.html"

# ----------- Basic Sentiment Words -----------
positive_words = ["good","great","excellent","amazing","love","like","nice","awesome","helpful","best"]
negative_words = ["bad","terrible","awful","hate","slow","buggy","confusing","broken","issue","worst"]

# ----------- Extract Comments from HTML -----------
def extract_comments(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    comments = []
    
    # Find any tag with class having "comment" or "reply"
    for tag in soup.find_all(True, class_=re.compile("comment|reply", re.I)):
        text = tag.get_text(strip=True)
        if len(text) > 10:
            comments.append(text)

    return comments

# ----------- Simple Sentiment Function -----------
def get_sentiment(text):
    words = text.lower().split()
    score = 0
    
    for w in words:
        if w in positive_words:
            score += 1
        if w in negative_words:
            score -= 1

    if score > 0:
        return "positive"
    elif score < 0:
        return "negative"
    else:
        return "neutral"

# ----------- Run Analyzer -----------
comments = extract_comments(file_path)

if not comments:
    print("No comments found. Try saving a webpage with visible user comments.")
else:
    sentiments = [get_sentiment(c) for c in comments]
    counts = Counter(sentiments)

    print("Overall tone:", max(counts, key=counts.get))
    print("Counts:", dict(counts))

    # ----------- Bar Plot -----------
    labels = ["positive","neutral","negative"]
    values = [counts.get(l,0) for l in labels]

    plt.bar(labels, values)
    plt.title("Sentiment Distribution")
    plt.ylabel("Number of Comments")
    plt.show()


No comments found. Try saving a webpage with visible user comments.
