  ## Importing Libraries

In [1]:
import pandas as pd 
import numpy as np
import re #Regular expressions
import nltk

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [25]:
# Download stopwords + lemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rachi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rachi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Loading the AG News dataset

In [26]:
test_df=pd.read_csv("Test.csv")
train_df=pd.read_csv("Train.csv")

In [27]:
train_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [28]:
test_df.head()

Unnamed: 0,Class Index,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [14]:
train_df.columns = ["label", "title", "description"]
test_df.columns = ["label", "title", "description"]

## Combine title and description into one text column

In [16]:
train_df["text"] = train_df["title"] + " " + train_df["description"]
test_df["text"] = test_df["title"] + " " + test_df["description"]

## Preprocessing function for cleaning text

In [18]:
stop_words = set(stopwords.words('english'))   # list of common words like 'is', 'the'
lemmatizer = WordNetLemmatizer()               # tool to convert words to base form

def preprocess(text):
    text = text.lower()                         # convert to lowercase (Hello → hello)
    text = re.sub(r'[^a-z\s]', '', text)        # remove numbers and punctuation
    tokens = text.split()                       # split sentence into words
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)                     # join words back into a cleaned sentence

# Apply this function to both training and test data
train_df["clean_text"] = train_df["text"].apply(preprocess)
test_df["clean_text"] = test_df["text"].apply(preprocess)

## Separate features (X) and labels (y)

In [19]:
X_train = train_df["clean_text"]   # the cleaned news text
y_train = train_df["label"]        # the category number (1,2,3,4)
X_test = test_df["clean_text"]
y_test = test_df["label"]

## Convert text to numbers using TF-IDF

In [29]:
vectorizer = TfidfVectorizer(max_features=5000)   # only take top 5000 words
X_train_tfidf = vectorizer.fit_transform(X_train) # learn from training data
X_test_tfidf = vectorizer.transform(X_test)       # apply same rules to test data

## Train Logistic Regression model

In [30]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)   # create model
model.fit(X_train_tfidf, y_train)           # train model with training data

## Test the model on unseen data

In [32]:
 y_pred = model.predict(X_test_tfidf)        # make predictions

# Show accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Show detailed report
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=["World","Sports","Business","Sci/Tech"]
))

Accuracy: 0.9064473684210527

Classification Report:
               precision    recall  f1-score   support

       World       0.92      0.90      0.91      1900
      Sports       0.95      0.97      0.96      1900
    Business       0.88      0.87      0.87      1900
    Sci/Tech       0.88      0.88      0.88      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600

