## Loading Datasets

In [1]:
import pandas as pd

In [2]:
# Loading train and test datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [3]:
# Checking the first few rows of the train dataset
print(train_df.head())

   Class Index                                              Title  \
0            3  Wall St. Bears Claw Back Into the Black (Reuters)   
1            3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2            3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3            3  Iraq Halts Oil Exports from Main Southern Pipe...   
4            3  Oil prices soar to all-time record, posing new...   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  


In [None]:
# Checking column names and basic info
print(train_df.info())

## Text Preprocessing

In [5]:
# Importing Necessary Values 
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk


In [6]:
# Downloading NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adhik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adhik\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adhik\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Initializing lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

In [9]:
# Combining Title and Description into a single column 
train_df['text'] = train_df['Title'] + " " + train_df['Description']
test_df['text'] = test_df['Title'] + " " + test_df['Description']

In [11]:
# Function for text cleaning 
def preprocess_text(text):
    #Converting to lowecase
    text = text.lower()
    #Removing special characters and numbers 
    text = re.sub(r'[^a-zA-Z\s]', '',text)
    #Tokenizing
    tokens = word_tokenize(text)
    #Removing stopwords and lemmatize 
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    #Rejoining tokens in a single string
    return ' '.join(tokens)

#Applying preprocessing 
train_df['cleaned_text'] = train_df['text'].apply(preprocess_text)
test_df['cleaned_text'] = test_df['text'].apply(preprocess_text)

#Verifying the results
print(train_df[['text' , 'cleaned_text']].head())

                                                text  \
0  Wall St. Bears Claw Back Into the Black (Reute...   
1  Carlyle Looks Toward Commercial Aerospace (Reu...   
2  Oil and Economy Cloud Stocks' Outlook (Reuters...   
3  Iraq Halts Oil Exports from Main Southern Pipe...   
4  Oil prices soar to all-time record, posing new...   

                                        cleaned_text  
0  wall st bear claw back black reuters reuters s...  
1  carlyle look toward commercial aerospace reute...  
2  oil economy cloud stock outlook reuters reuter...  
3  iraq halt oil export main southern pipeline re...  
4  oil price soar alltime record posing new menac...  


## Feature Extraction

In this step, we convert the cleaned text data into numerical format, which machine learning models can understand. We'll use the TF-IDF (Term Frequency-Inverse Document Frequency) method for this

In [12]:
# Importing necesary library
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Initializing TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)

In [14]:
# Fiting and transforming the cleaned text for training and testing datasets
X_train = tfidf.fit_transform(train_df['cleaned_text']).toarray()
X_test = tfidf.transform(test_df['cleaned_text']).toarray()


In [15]:
#Target variable (Class Index)
y_train = train_df['Class Index']
y_test = test_df['Class Index']

In [16]:
# Checking the shape of the feature matrix
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (120000, 5000)
X_test shape: (7600, 5000)


## Model Training and Evaluation

We'll use Logistic Regression which is simple but effective algorithm and it works wekk with TF-IDF features

In [17]:
#Importing necessary library 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , classification_report , confusion_matrix

In [19]:
# Initializing the logistic Regression model
model = LogisticRegression(max_iter=1000 , random_state=42)

In [20]:
# Training the model on the training data
model.fit(X_train , y_train)

In [21]:
# Make predictions on the test data 
y_pred = model.predict(X_test)

In [22]:
# Evaluating the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.91
Classification Report:
              precision    recall  f1-score   support

           1       0.92      0.90      0.91      1900
           2       0.95      0.98      0.96      1900
           3       0.88      0.87      0.87      1900
           4       0.88      0.88      0.88      1900

    accuracy                           0.91      7600
   macro avg       0.91      0.91      0.91      7600
weighted avg       0.91      0.91      0.91      7600

Confusion Matrix:
[[1707   62   75   56]
 [  19 1854   12   15]
 [  66   21 1648  165]
 [  59   24  139 1678]]


## Saving the Model and Vectorizer

In [25]:
#Importing Necessary Library
import os
import joblib

In [27]:
# Creating the 'models' folder if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')

In [28]:
# Saving the Logistic Regression model
joblib.dump(model, 'models/logistic_regression_model.pkl')

# Saving the TF-IDF vectorizer
joblib.dump(tfidf, 'models/tfidf_vectorizer.pkl')

['models/tfidf_vectorizer.pkl']