In [None]:
#import libraries
import os
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [None]:
#PartB: load dataset
# The CSV location defaults to the original author's local path; set the
# NEWS_DATA_PATH environment variable to run the notebook on another machine
# without editing this cell.
DATA_PATH = os.environ.get(
    "NEWS_DATA_PATH",
    r'C:\Users\user\Downloads\data_news - data_news.csv',
)
news_df = pd.read_csv(DATA_PATH)
news_df.head()

In [None]:
#1. Data Collection and Preprocessing
# info() prints column dtypes and non-null counts itself; the per-category
# row counts are left as the last expression so the notebook renders them.
news_df.info()
news_df['category'].value_counts()

In [None]:
# Bar chart of the number of rows per category, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(x='category', data=news_df, ax=ax)
ax.set_title("Category Distribution")
# Tilt the category names so neighbouring labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# The stop-word list and the WordNet lemmatizer read NLTK corpora from disk and
# raise LookupError when those corpora have not been installed, so fetch them
# first. "omw-1.4" is included because some NLTK releases need it for WordNet —
# NOTE(review): drop it if the pinned NLTK version does not require it.
for resource in ("stopwords", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

stop_words = set(stopwords.words('english'))   # set => O(1) membership tests
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise one raw text value into a space-joined string of lemmatised tokens.

    HTML-like tags and every non-letter character are replaced by spaces, the
    text is lower-cased and split on whitespace, tokens found in the
    module-level ``stop_words`` set are dropped, and each remaining token is
    passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    text : object
        Raw value; it is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``text`` is missing according to
        ``pd.isnull`` or contains no usable tokens.
    """
    if pd.isnull(text):
        return ""
    # Substitute a space (not an empty string) for each tag so that words on
    # either side of a tag, e.g. "one<br>two", are not fused into "onetwo".
    text = re.sub(r'<.*?>', ' ', str(text))
    text = re.sub(r'[^a-zA-Z]', ' ', text)             # Remove non-letter characters
    tokens = text.lower().split()                      # Lowercase, then tokenize on whitespace
    return " ".join(lemmatizer.lemmatize(w) for w in tokens if w not in stop_words)

# Run the cleaner over every short description and keep the result in a new column.
raw_descriptions = news_df['short_description']
news_df['cleaned_text'] = raw_descriptions.map(preprocess)

In [None]:
#2. Feature Extraction
# Keep the TF-IDF matrix in the sparse form returned by fit_transform():
# converting it with .toarray() allocates a dense (n_documents x 5000) array,
# while train_test_split and scikit-learn's LogisticRegression, MultinomialNB
# and SVC all accept sparse input.
# NOTE(review): the vectoriser is fitted on every row here, including rows
# that will later be held out for testing.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(news_df['cleaned_text'])

le = LabelEncoder()
y = le.fit_transform(news_df['category'])  # Converts labels to integers

In [None]:
# 3. Model Development and Training
# Split the cleaned text itself, then fit the vectoriser on the training rows
# only. Fitting TF-IDF on all rows before splitting lets held-out documents
# influence the vocabulary and IDF weights, which inflates test scores.
# random_state and stratify are unchanged, so the same rows are held out.
text_train, text_test, y_train, y_test = train_test_split(
    news_df['cleaned_text'], y, test_size=0.2, random_state=42, stratify=y
)
X_train = tfidf.fit_transform(text_train)   # learn vocabulary/IDF from training text
X_test = tfidf.transform(text_test)         # reuse them for the held-out text

In [None]:
# Candidate classifiers, keyed by the heading printed above each report.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(kernel='linear'),
}


def fit_and_report(label, estimator):
    """Fit ``estimator`` on the training split and print its test-split metrics.

    Prints a heading, the per-class classification report and the overall
    accuracy, then returns the predictions made for ``X_test``.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print(f"===== {label} =====")
    print(classification_report(y_test, predictions, target_names=le.classes_))
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)
    print()
    return predictions


for name, model in models.items():
    y_pred = fit_and_report(name, model)


In [None]:
# 4. Model Evaluation
# NOTE(review): this repeats the Logistic Regression fit from the comparison
# loop; models["Logistic Regression"] could be reused if that cell always runs first.
best_model = LogisticRegression(max_iter=1000)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Draw on an explicit, enlarged Axes and rotate the category names on the
# x-axis (as in the category count plot) so neighbouring labels do not overlap.
# The figure size is a readability choice — adjust to the number of categories.
fig, ax = plt.subplots(figsize=(10, 8))
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    display_labels=le.classes_,
    cmap='Blues',
    xticks_rotation=45,
    ax=ax,
)
ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()


In [None]:
# Vocabulary terms (column order of the TF-IDF matrix) and the fitted coefficients.
feature_names = tfidf.get_feature_names_out()
coefs = best_model.coef_

# For every class, list the ten terms with the largest coefficients, strongest first.
# Indexes coefs by class position — assumes coef_ has one row per class
# (multiclass); TODO confirm before using this on a two-class problem.
for row_idx, class_label in enumerate(le.classes_):
    strongest_first = np.argsort(coefs[row_idx])[::-1][:10]
    top_words = [feature_names[col] for col in strongest_first]
    print(f"Top words for {class_label}:")
    print(top_words)
    print()

In [None]:
'''Insights:
The model performs best on categories with distinctive, domain-specific vocabulary (Food, Sports, Style).
Because it relies on TF-IDF word weights, it has no deep semantic understanding of the text.
It struggles with categories whose vocabulary overlaps.
Logistic Regression achieved an accuracy of 65.87%, with strong performance on categories like Style & Beauty, Food & Drink, and Sports.'''