Video Link: https://drive.google.com/file/d/1OuvsmREwmC-aHIJ0lY_6yaXd2KJINuxf/view?usp=sharing

In [1]:
import pandas as pd
df = pd.read_csv('data_news - data_news.csv')
print(df.head())
print("\nDataset shape:", df.shape)
print("\nClass distribution:\n", df['category'].value_counts())
print("\nMissing values:\n", df.isnull().sum())


   category                                           headline  \
0  WELLNESS              143 Miles in 35 Days: Lessons Learned   
1  WELLNESS       Talking to Yourself: Crazy or Crazy Helpful?   
2  WELLNESS  Crenezumab: Trial Will Gauge Whether Alzheimer...   
3  WELLNESS                     Oh, What a Difference She Made   
4  WELLNESS                                   Green Superfoods   

                                               links  \
0  https://www.huffingtonpost.com/entry/running-l...   
1  https://www.huffingtonpost.com/entry/talking-t...   
2  https://www.huffingtonpost.com/entry/crenezuma...   
3  https://www.huffingtonpost.com/entry/meaningfu...   
4  https://www.huffingtonpost.com/entry/green-sup...   

                                   short_description  \
0  Resting is part of training. I've confirmed wh...   
1  Think of talking to yourself as a tool to coac...   
2  The clock is ticking for the United States to ...   
3  If you want to be busy, keep trying to 

In [2]:
df.drop(columns=['keywords', 'links'], inplace=True)
df['text'] = df['headline'] + ' ' + df['short_description']
df[['text', 'category']].head()


Unnamed: 0,text,category
0,143 Miles in 35 Days: Lessons Learned Resting ...,WELLNESS
1,Talking to Yourself: Crazy or Crazy Helpful? T...,WELLNESS
2,Crenezumab: Trial Will Gauge Whether Alzheimer...,WELLNESS
3,"Oh, What a Difference She Made If you want to ...",WELLNESS
4,"Green Superfoods First, the bad news: Soda bre...",WELLNESS


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)
df["clean_text"] = df["text"].apply(preprocess)
df[["clean_text", "category"]].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nsk2k\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nsk2k\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nsk2k\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,clean_text,category
0,mile day lesson learned resting part training ...,WELLNESS
1,talking crazy crazy helpful think talking tool...,WELLNESS
2,crenezumab trial gauge whether alzheimers drug...,WELLNESS
3,oh difference made want busy keep trying perfe...,WELLNESS
4,green superfoods first bad news soda bread cor...,WELLNESS


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['category'])
print("Feature matrix shape:", X.shape)
print("Number of categories:", len(label_encoder.classes_))


Feature matrix shape: (50000, 5000)
Number of categories: 10


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.7983

Classification Report:
                 precision    recall  f1-score   support

      BUSINESS       0.73      0.78      0.75       955
 ENTERTAINMENT       0.77      0.78      0.78       985
  FOOD & DRINK       0.85      0.82      0.84      1021
     PARENTING       0.78      0.76      0.77      1030
      POLITICS       0.79      0.74      0.76      1034
        SPORTS       0.87      0.89      0.88       995
STYLE & BEAUTY       0.86      0.85      0.85       986
        TRAVEL       0.83      0.80      0.82      1008
      WELLNESS       0.72      0.75      0.74      1009
    WORLD NEWS       0.79      0.81      0.80       977

      accuracy                           0.80     10000
     macro avg       0.80      0.80      0.80     10000
  weighted avg       0.80      0.80      0.80     10000

