In [None]:
!pip install numpy==1.26.4




In [None]:
# Attach Google Drive to the Colab filesystem; later cells read and write
# under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import requests

# Test Hugging Face connectivity before attempting any downloads
try:
    response = requests.get("https://huggingface.co", timeout=30)
    print(f"Status: {response.status_code}")
    # Only report success for a non-error status: a 4xx/5xx reply means the
    # host answered but the hub is not actually usable.
    if response.ok:
        print("Hugging Face is accessible")
    else:
        print(f"Hugging Face returned an error status: {response.status_code}")
except requests.RequestException as e:
    # Network-level failures raised by requests (DNS, timeout, refused
    # connection, TLS, ...). Best-effort check: report and carry on.
    print(f"Cannot reach Hugging Face: {e}")


from huggingface_hub import HfApi, hf_hub_download, login

import os
from getpass import getpass

model_name = 'distilbert-base-uncased'
api = HfApi()

# SECURITY: never hard-code access tokens in a notebook. The token that used
# to be embedded here must be treated as leaked and revoked in the Hugging
# Face account settings. Read it from the environment, or prompt for it.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

# Mirror every file of the base model repository into ./sentiment so the
# model can later be loaded from that local directory.
repo_files = api.list_repo_files(repo_id=model_name)
for repo_file in repo_files:
    hf_hub_download(repo_id=model_name, filename=repo_file, local_dir='./sentiment', token=hf_token)


# Load and preprocess data
df = pd.read_csv("/content/news.csv")
# Keep only the text and label columns and drop rows missing either one.
# dropna() returns a new frame, so the label assignment below does not write
# into a view of the original. The index is reset so that Dataset.from_pandas
# does not carry a leftover non-default index along as an extra column.
df = df[['news', 'sentiment']].dropna().reset_index(drop=True)

# Encode labels: string classes -> integer ids (le.classes_ keeps the names)
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

# Create dataset
hf_df = Dataset.from_pandas(df)
# Fixed seed so the train/test partition is the same on every run
dataset = hf_df.train_test_split(test_size=0.2, seed=42)

# Load model and tokenizer
# "sentiment" is a relative path: it is the same local directory
# ('./sentiment') that the base model files were downloaded into.
model_name = "sentiment"
print(f"Loading tokenizer for {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Loading model for {model_name}...")
# Get number of unique labels for classification
num_labels = len(le.classes_)
# Store the class names in the model config so the saved classifier carries
# its own id -> label mapping and inference code does not have to guess it.
id2label = {idx: str(cls) for idx, cls in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Tokenization function
def tokenize(batch):
    """Tokenize the ``news`` field of a batch, for use with ``Dataset.map``.

    Args:
        batch: Mapping with a ``'news'`` entry holding the texts to encode.

    Returns:
        The tokenizer output for those texts, padded and truncated to
        exactly 128 tokens each.
    """
    # No return_tensors here: Dataset.map stores the result in its own
    # columnar format, and set_format("torch") is what produces tensors later.
    return tokenizer(batch['news'], padding="max_length", truncation=True, max_length=128)

# Apply tokenization
print("Tokenizing dataset...")
# Tokenize in batches, then expose the label column under the name 'labels'
# (required by Trainer).
dataset = dataset.map(tokenize, batched=True).rename_column('sentiment', 'labels')

# Serve only the model inputs and the labels, as PyTorch tensors
torch_columns = ["input_ids", "attention_mask", "labels"]
dataset.set_format("torch", columns=torch_columns)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs the two strategies to match.
    eval_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    # The string "none" disables all reporting integrations (wandb etc.);
    # passing None does not.
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU runtimes.
    fp16=torch.cuda.is_available(),
)

# Compute metrics function
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: Pair ``(logits, labels)``; ``logits`` holds per-class
            scores on the last axis and ``labels`` the true class ids.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    # Defensive: if predictions arrive as a tuple of outputs, the class
    # scores are taken to be the first element.
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    # zero_division=0: a class that is never predicted scores 0 without
    # emitting an UndefinedMetricWarning on every evaluation.
    prec, rec, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    return {"accuracy": acc, "precision": prec, "recall": rec, "f1": f1}

# Create trainer
# Early stopping with a patience of 2 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],  # held-out split from train_test_split
    callbacks=[early_stopping],
)

import json

# Train the model
print("Starting training...")
trainer.train()

# Save model and tokenizer
# One directory for both artefacts. It is the directory the inference cell of
# this notebook loads from ("sentiment_classifier_final"); previously the
# model was written to "sentiment_classifier", which that cell never reads.
save_dir = "/content/drive/MyDrive/sentiment_classifier_final"
print("Saving model and tokenizer...")
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Evaluate the model
print("Evaluating model...")
results = trainer.evaluate()
print("Final Results:")
print(results)

# Save the results in a file
with open("/content/drive/MyDrive/training_results.json", "w") as f:
    json.dump(results, f, indent=2)


# Print label classes for reference
print(f"\nLabel classes: {le.classes_}")
print(f"Number of labels: {num_labels}")

print("All files saved to Google Drive successfully!")

ImportError: cannot import name 'Trainer' from 'transformers' (/usr/local/lib/python3.11/dist-packages/transformers/__init__.py)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Directory on Google Drive holding the fine-tuned classifier and its tokenizer
model_path = "/content/drive/MyDrive/sentiment_classifier_final"

# Restore both artefacts from the same directory
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_path),
    AutoModelForSequenceClassification.from_pretrained(model_path),
)
# Switch to evaluation mode (disables dropout) for inference
model.eval()


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
pip install feedparser

Collecting feedparser
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=67e2d89479783b2132e5a2866dc3307fa94b43edf1ddabef8a312893f587b7e2
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packages: sgmllib3k, feedparser
Successfully installed feedparser-6.0.11 sgmllib3k-1.0.0


In [None]:
import feedparser

rss_url = "https://news.google.com/rss/search?q=Vadapalani+Chennai&hl=en-IN&gl=IN&ceid=IN:en"
feed = feedparser.parse(rss_url)

# feedparser does not raise on fetch/parse problems; it flags them via `bozo`.
if feed.bozo:
    print(f"Warning: feed was not parsed cleanly: {feed.get('bozo_exception')}")

# Collapse runs of whitespace: some titles arrive with embedded newlines,
# which would otherwise end up in the model input and the printed report.
# Entries without a title are skipped rather than raising.
news_texts = [
    " ".join(entry.get("title", "").split())
    for entry in feed.entries
    if entry.get("title")
]
if not news_texts:
    print("No headlines were returned for this query")
print(news_texts)


['Open stormwater drains still dot Vadapalani roads - Times of India', 'Chennai Metro Rail to build skywalk to link phase I and phase II networks at Vadapalani - The Hindu', 'Chennai Set For Two New Transit-Oriented Developments As CMRL Plans High-Rise Hubs At Vadapalani, Mandaveli - Swarajyamag', "Chennai's Vadapalani bus terminus\nset for major overhaul - The New Indian Express", 'Vadapalani depot to get a facelift for Rs 481 crore - dtnext', 'CMAML floats tender to develop Vadapalani depot costing ₹481 crore - BusinessLine', 'Integrated Bus Hub with Commercial Complex Coming Up in Vadapalani at ₹800 Crore - LiveChennai', 'Chennai Metro Rail Limited to build multi-modal transit hubs in Vadapalani and Mandaveli - The Hindu', 'Underage driving: 14-year-old boy crashes dad’s car in Chennai; hits auto, pedestrian - Times of India', 'Vadapalani accident: Elderly man hit by minor car driver dies - dtnext', 'Four suspects linked to Chennai diamond theft case caught in Thoothukudi - The New 

In [None]:
from torch.nn.functional import softmax

def predict_sentiment(text):
    """Classify a single piece of text with the loaded sentiment model.

    Args:
        text: One string to classify. The ``.item()`` calls below require a
            batch of exactly one, so this must not be a list of several texts.

    Returns:
        Tuple ``(predicted_class, confidence)``: the integer id of the
        highest-probability class and its softmax probability as a float.
    """
    # Truncate to the same 128-token limit used when tokenizing the training data.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # Run on whichever device currently holds the model weights.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).item()
    confidence = probs[0][predicted_class].item()
    return predicted_class, confidence

# NOTE(review): this mapping is hard-coded and assumes a two-class model with
# this exact id order. Verify it against the label encoding used in training.
label_map = {0: "Negative", 1: "Positive"}  # or whatever your label mapping is

for news in news_texts:
    label, conf = predict_sentiment(news)
    # The model may have more classes than label_map covers; use the name
    # stored in the model config for unmapped ids instead of raising KeyError.
    sentiment = label_map.get(label) or model.config.id2label.get(label, str(label))
    print(f"📰 {news}\n➡️ Sentiment: {sentiment} (confidence: {conf:.2f})\n")


📰 Open stormwater drains still dot Vadapalani roads - Times of India
➡️ Sentiment: Positive (confidence: 0.92)

📰 Chennai Metro Rail to build skywalk to link phase I and phase II networks at Vadapalani - The Hindu
➡️ Sentiment: Positive (confidence: 0.93)

📰 Chennai Set For Two New Transit-Oriented Developments As CMRL Plans High-Rise Hubs At Vadapalani, Mandaveli - Swarajyamag
➡️ Sentiment: Positive (confidence: 1.00)

📰 Chennai's Vadapalani bus terminus
set for major overhaul - The New Indian Express
➡️ Sentiment: Negative (confidence: 0.98)

📰 Vadapalani depot to get a facelift for Rs 481 crore - dtnext
➡️ Sentiment: Negative (confidence: 0.99)

📰 CMAML floats tender to develop Vadapalani depot costing ₹481 crore - BusinessLine
➡️ Sentiment: Negative (confidence: 0.68)

📰 Integrated Bus Hub with Commercial Complex Coming Up in Vadapalani at ₹800 Crore - LiveChennai
➡️ Sentiment: Positive (confidence: 1.00)

📰 Chennai Metro Rail Limited to build multi-modal transit hubs in Vadapalani