In [8]:
# =================== INSTALL REQUIRED LIBRARIES ===================

#!pip install pandas nltk scikit-learn keras tensorflow newspaper3k networkx
!pip install lxml_html_clean
!pip install newspaper
# # =================== IMPORTS ===================
import pandas as pd
import numpy as np
import nltk
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from newspaper import Article
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('stopwords')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# =================== TEXT CLEANING ===================
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = str(text).lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha()]
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

# =================== LOAD & PROCESS DATA ===================
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv("/content/drive/MyDrive/Data/train.csv")
df.columns = ['class_id', 'title', 'description']

category_map = {
    1: 'World',
    2: 'Sports',
    3: 'Business',
    4: 'Sci/Tech'
}
df['category'] = df['class_id'].map(category_map)
df['text'] = df['title'].fillna('') + ". " + df['description'].fillna('')
df['cleaned'] = df['text'].apply(clean_text)

# =================== TOKENIZATION ===================
max_words = 5000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['cleaned'])
sequences = tokenizer.texts_to_sequences(df['cleaned'])
X = pad_sequences(sequences, maxlen=max_len)

# =================== LABEL ENCODING ===================
le = LabelEncoder()
y = le.fit_transform(df['category'])
y = to_categorical(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# =================== LSTM MODEL ===================
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(4, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# =================== TRAINING ===================
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# =================== EVALUATION ===================
loss, acc = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {acc:.2f}")

# =================== SUMMARIZATION FUNCTION ===================
def summarize(text, top_n=2, max_words=40):
    sentences = sent_tokenize(text)
    if len(sentences) <= top_n:
        return ' '.join(sentences[:top_n])

    tfidf = Tokenizer()
    tfidf.fit_on_texts(sentences)
    tfidf_matrix = np.array([
        np.mean([tfidf.word_index.get(w, 0) for w in word_tokenize(s.lower()) if w.isalpha()], dtype=float)
        for s in sentences
    ]).reshape(-1, 1)

    sim_matrix = cosine_similarity(tfidf_matrix)
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)

    ranked = sorted(((scores[i], s, i) for i, s in enumerate(sentences)), reverse=True)

    summary = []
    total_words = 0
    for _, sentence, idx in sorted(ranked[:len(sentences)], key=lambda x: x[2]):
        word_count = len(sentence.split())
        if total_words + word_count <= max_words:
            summary.append(sentence)
            total_words += word_count
        if len(summary) >= top_n or total_words >= max_words:
            break

    return ' '.join(summary)

# =================== URL FETCH FUNCTION ===================
def fetch_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article.title, article.text

# =================== PREDICT FUNCTION ===================
def infer_news_category_and_summary_dl(url):
    title, text = fetch_article(url)
    cleaned = clean_text(title + ". " + text)
    seq = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(seq, maxlen=max_len)

    prediction = model.predict(padded)
    category = le.inverse_transform([np.argmax(prediction)])[0]
    summary = summarize(text)
    return {
        "title": title,
        "category": category,
        "summary": summary
    }

# =================== TEST URL ===================
url = "https://www.thehindu.com/news/international/spacex-rocket-being-tested-in-texas-explodes-but-no-injuries-reported/article69712200.ece"
result = infer_news_category_and_summary_dl(url)

print("\n📰 Title:\n", result['title'])
print("\n📂 Predicted Category:", result['category'])
print("\n📝 Summary:\n", result['summary'])


Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.2
Collecting newspaper
  Downloading newspaper-0.1.0.7.tar.gz (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.9/176.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with 

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




Epoch 1/5
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 11ms/step - accuracy: 0.7995 - loss: 0.5446 - val_accuracy: 0.9057 - val_loss: 0.2860
Epoch 2/5
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - accuracy: 0.9162 - loss: 0.2604 - val_accuracy: 0.9034 - val_loss: 0.2853
Epoch 3/5
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - accuracy: 0.9264 - loss: 0.2167 - val_accuracy: 0.9024 - val_loss: 0.2870
Epoch 4/5
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.9376 - loss: 0.1832 - val_accuracy: 0.9003 - val_loss: 0.2922
Epoch 5/5
[1m2400/2400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 11ms/step - accuracy: 0.9425 - loss: 0.1609 - val_accuracy: 0.9038 - val_loss: 0.3223
[1m750/750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8981 - loss: 0.3328

✅ Test Accuracy: 0.90
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[