In [36]:
import pandas as pd

df = pd.read_csv("data/subject_dataset.csv", encoding="utf-8", on_bad_lines='warn')
print(df.head())
print(df.info())


                                            question            subject
0   What is the main purpose of an operating system?  Operating Systems
1  Explain the difference between preemptive and ...  Operating Systems
2       What is a deadlock? How can it be prevented?  Operating Systems
3            Describe the concept of virtual memory.  Operating Systems
4  What are the different process states in an op...  Operating Systems
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  338 non-null    object
 1   subject   338 non-null    object
dtypes: object(2)
memory usage: 5.4+ KB
None


Skipping line 62: expected 2 fields, saw 4
Skipping line 302: expected 2 fields, saw 4
Skipping line 318: expected 2 fields, saw 4
Skipping line 326: expected 2 fields, saw 4

  df = pd.read_csv("data/subject_dataset.csv", encoding="utf-8", on_bad_lines='warn')


In [37]:
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z0-9\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

df['clean_question'] = df['question'].apply(clean_text)


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer with unigrams and bigrams, and remove common stop words
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

X = vectorizer.fit_transform(df['clean_question'])
y = df['subject']


In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 270
Testing samples: 68


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Define model with improved solver for multiclass
lr = LogisticRegression(max_iter=1000, solver='saga')  # removed deprecated multi_class

param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l2']  # saga supports only l2 for multinomial
}

# Grid Search
grid = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Best model
best_model = grid.best_estimator_

# Predict & evaluate
y_pred = best_model.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7647058823529411

Classification Report:
                        precision    recall  f1-score   support

Computer Architecture       0.75      0.33      0.46         9
    Computer Networks       0.44      0.70      0.54        10
                 DBMS       0.90      0.90      0.90        10
                  DSA       0.73      0.80      0.76        10
                 OOPS       1.00      0.90      0.95        10
    Operating Systems       0.80      0.80      0.80        10
 Software Engineering       1.00      0.89      0.94         9

             accuracy                           0.76        68
            macro avg       0.80      0.76      0.76        68
         weighted avg       0.80      0.76      0.77        68



In [43]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df['subject'])

# Use y_encoded for training instead of raw y


In [44]:
import joblib

joblib.dump(label_encoder, 'label_encoder.pkl')


['label_encoder.pkl']

In [45]:
import os

os.rename('model.pkl', 'doubt_classifier_model.pkl')
