In [None]:
from google.colab import drive
drive.mount('/content/drive')
!pip install newspaper3k scikit-learn nltk networkx joblib
!pip install lxml_html_clean
!pip install newspaper

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from joblib import dump, load

from newspaper import Article
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Download NLTK resources if not already done.
# 'punkt' serves older NLTK releases; NLTK >= 3.9 loads the tokenizer data
# from 'punkt_tab' instead, so both are requested to keep sent_tokenize /
# word_tokenize working regardless of which NLTK version pip installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Initialize stopwords and stemmer (shared by clean_text below)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Clean text function: keeps only sentences with at least 8 cleaned words
def clean_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The input is coerced to ``str`` and lower-cased, split into sentences, and
    each sentence is reduced to its purely alphabetic, non-stopword tokens,
    which are then stemmed. Sentences that end up with fewer than 8 tokens are
    discarded entirely.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        The surviving sentences' stemmed tokens joined by single spaces, or an
        empty string when no sentence meets the 8-token minimum.
    """
    kept = []
    for raw_sentence in sent_tokenize(str(text).lower()):
        stems = [
            stemmer.stem(word)
            for word in word_tokenize(raw_sentence)
            if word.isalpha() and word not in stop_words
        ]
        # Short sentences are dropped rather than merged with neighbours.
        if len(stems) < 8:
            continue
        kept.append(' '.join(stems))

    return ' '.join(kept)

# Load dataset
# The columns are renamed positionally, so the CSV must contain exactly three
# columns in the order (class id, title, description).
# NOTE(review): read_csv uses the first row as the header by default; if this
# file has no header row the first record is consumed as column names — confirm.
df = pd.read_csv("/content/drive/MyDrive/Data/train.csv")
df.columns = ['class_id', 'title', 'description']

# Map class IDs to category labels
# NOTE(review): any class_id outside 1-4 would map to NaN — presumably the
# dataset only contains these four ids; verify.
category_map = {
    1: 'World',
    2: 'Sports',
    3: 'Business',
    4: 'Sci/Tech'
}
df['category'] = df['class_id'].map(category_map)

# Combine title and description into one string; missing values become ''
df['text'] = df['title'].fillna('') + ". " + df['description'].fillna('')

# Apply cleaning (lowercase, tokenize, drop stopwords, stem; see clean_text)
df['cleaned'] = df['text'].apply(clean_text)

# Remove rows with empty cleaned text
# clean_text discards sentences with fewer than 8 tokens, so a row whose
# sentences are all short ends up empty and carries no features.
df = df[df['cleaned'].str.strip() != '']

# Show a sample
print(df[['category', 'text']].head())

# Train-test split: 80% train / 20% test, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(df['cleaned'], df['category'], test_size=0.2, random_state=42)

# Pipeline with TF-IDF and Naive Bayes
# The vectorizer is capped at 5000 features and is fitted inside the pipeline,
# so the saved artifact accepts cleaned text strings directly.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('nb', MultinomialNB())
])

# Train model
pipeline.fit(X_train, y_train)

# Evaluate model on the held-out split
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Save model
# Written to the current working directory; the inference cell loads the same
# relative filename.
dump(pipeline, "news_classifier.joblib")



In [None]:
import feedparser
from datetime import datetime
from newspaper import Article
from joblib import load
import nltk
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download NLTK resources if not present.
# 'punkt' serves older NLTK releases; NLTK >= 3.9 loads the tokenizer data
# from 'punkt_tab' instead, so both are requested to keep sent_tokenize /
# word_tokenize working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load classifier from earlier model
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load an artifact produced by the training cell or another trusted source.
clf = load("news_classifier.joblib")

# Setup NLP tools (shared by clean_text below)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    """Turn raw text into the stemmed-token string the classifier expects.

    Processing happens per sentence on the lower-cased ``str(text)``: tokens
    that are not purely alphabetic or that are English stopwords are removed,
    the rest are stemmed, and a sentence is kept only if at least 8 stems
    remain.

    Args:
        text: Any value; it is converted with ``str()`` before processing.

    Returns:
        All kept stems joined by single spaces, or ``''`` if every sentence
        was shorter than 8 stems.
    """
    def _stems(sentence):
        # Alphabetic filter first, then stopword removal, then stemming.
        alphabetic = filter(str.isalpha, word_tokenize(sentence))
        return [stemmer.stem(tok) for tok in alphabetic if tok not in stop_words]

    per_sentence = (_stems(s) for s in sent_tokenize(str(text).lower()))
    return ' '.join(' '.join(stems) for stems in per_sentence if len(stems) >= 8)

def summarize(text, top_n=2, max_words=40):
    """Build a short extractive summary of ``text`` using TextRank-style scoring.

    Sentences are vectorized with TF-IDF, connected in a graph weighted by
    cosine similarity, and scored with PageRank. The highest-scoring sentences
    that fit the word budget are selected and then emitted in their original
    document order.

    Args:
        text: The article body to summarize.
        top_n: Maximum number of sentences in the summary.
        max_words: Word budget (whitespace-separated) for the selected
            sentences.

    Returns:
        The summary as a single string. If the text has ``top_n`` sentences or
        fewer it is returned whole (the word budget is not applied). If no
        sentence fits the budget, the best-scoring sentence is truncated to
        ``max_words`` words instead of returning an empty summary.
    """
    sentences = sent_tokenize(text)
    if len(sentences) <= top_n:
        return ' '.join(sentences)

    tfidf = TfidfVectorizer().fit_transform(sentences)
    sim_matrix = cosine_similarity(tfidf)
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    # Sentence indices from most to least central. Sorting on the score alone
    # keeps ties in document order.
    ranked_indices = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)

    # Select by rank first; ordering by position happens only after selection,
    # otherwise the ranking would have no effect on which sentences are kept.
    chosen = []
    total_words = 0
    for idx in ranked_indices:
        word_count = len(sentences[idx].split())
        if total_words + word_count <= max_words:
            chosen.append(idx)
            total_words += word_count
        if len(chosen) >= top_n or total_words >= max_words:
            break

    if not chosen:
        # Every sentence exceeds the budget: fall back to a truncated version
        # of the top-ranked sentence.
        return ' '.join(sentences[ranked_indices[0]].split()[:max_words])

    return ' '.join(sentences[i] for i in sorted(chosen))

def fetch_article_details(url):
    """Download one article, classify it, and summarize it.

    Args:
        url: Address of the article page to download and parse.

    Returns:
        A dict with keys ``"title"``, ``"url"``, ``"predicted_category"`` and
        ``"summary"``, or ``None`` when the article could not be downloaded,
        parsed, classified or summarized, or when no body text was extracted.
        Failures are reported on stdout rather than raised, so a single bad
        article does not abort the caller's loop.
    """
    try:
        article = Article(url)
        article.download()
        article.parse()
        title, text = article.title or "", article.text or ""

        # Without body text there is nothing meaningful to classify or
        # summarize, so skip the article instead of returning a bogus result.
        if not text.strip():
            print(f"⚠️ Skipping {url}: no article text could be extracted")
            return None

        cleaned = clean_text(title + ". " + text)
        predicted_category = clf.predict([cleaned])[0]
        summary = summarize(text)
        return {
            "title": title,
            "url": url,
            "predicted_category": predicted_category,
            "summary": summary
        }
    except Exception as e:
        # Deliberately best-effort, but say why the article was skipped.
        print(f"⚠️ Skipping {url}: {type(e).__name__}: {e}")
        return None

def fetch_top_articles_from_rss(date_str, category_rss_path, max_articles=5):
    """Collect processed articles published on a given date from one RSS feed.

    Args:
        date_str: Target publication date in ``YYYY-MM-DD`` format.
        category_rss_path: Path segment inserted into the thehindu.com feed
            URL (e.g. ``"business"``).
        max_articles: Maximum number of articles to return. Defaults to 5.

    Returns:
        A list of up to ``max_articles`` dicts as produced by
        ``fetch_article_details``, in feed order. Entries without a usable
        publication date or link, and articles that fail to process, are
        skipped.

    Raises:
        ValueError: If ``date_str`` does not match ``YYYY-MM-DD``.
    """
    # Validate the date before touching the network so bad input fails fast.
    target_date = datetime.strptime(date_str, "%Y-%m-%d").date()

    url = f"https://www.thehindu.com/{category_rss_path}/?service=rss"
    feed = feedparser.parse(url)

    articles = []
    for entry in feed.entries:
        if len(articles) >= max_articles:
            break

        # The attribute can be present but None when the feed's date string
        # could not be parsed, so test the value rather than its existence.
        published = getattr(entry, 'published_parsed', None)
        link = getattr(entry, 'link', None)
        if published is None or link is None:
            continue

        # NOTE(review): feedparser documents *_parsed dates as UTC, so this
        # compares UTC calendar dates — confirm that matches what users expect
        # for a feed published in another time zone.
        pub_date = datetime(*published[:6]).date()
        if pub_date != target_date:
            continue

        details = fetch_article_details(link)
        if details:
            articles.append(details)
    return articles

def main():
    """Prompt for a date, then fetch, classify and print articles per category.

    For each of the four categories the matching The Hindu RSS feed is queried
    via ``fetch_top_articles_from_rss`` and every returned article is printed
    with its title, URL, predicted category and summary (truncated to 300
    characters).

    Returns:
        A dict mapping each category name to its list of article dicts. An
        empty dict is returned if the entered date is not valid ``YYYY-MM-DD``.
    """
    date_input = input("📅 Enter date (YYYY-MM-DD): ").strip()

    # Reject malformed dates up front instead of failing inside the fetch loop.
    try:
        datetime.strptime(date_input, "%Y-%m-%d")
    except ValueError:
        print(f"❌ Invalid date {date_input!r}; expected format YYYY-MM-DD.")
        return {}

    # Category label -> path segment of the corresponding RSS feed.
    category_mapping = {
        "World": "news/international",
        "Business": "business",
        "Sports": "sport",
        "Sci/Tech": "sci-tech"
    }

    final_results = {}

    for category_name, rss_path in category_mapping.items():
        print(f"\n🔍 Fetching top 5 articles for **{category_name}** on {date_input}...")
        articles = fetch_top_articles_from_rss(date_input, rss_path)
        final_results[category_name] = articles

        if not articles:
            print("   (no articles found for this date)")

        for idx, article in enumerate(articles, 1):
            print(f"\n{idx}. 📰 Title: {article['title']}")
            print(f"   🔗 URL: {article['url']}")
            print(f"   📂 Predicted Category: {article['predicted_category']}")
            print(f"   📝 Summary:\n{article['summary'][:300]}{'...' if len(article['summary']) > 300 else ''}")

    # Hand the collected results back so they can be inspected after the run.
    return final_results

# Start the interactive workflow when this cell/script is executed directly;
# the guard keeps main() from running if the code is imported as a module.
if __name__ == "__main__":
    main()
