## Importing libraries and data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/data/data.csv')
df.head()

Unnamed: 0,text,label,category
0,sure bashers pens fans pretty confused lack ki...,5,sports
1,brother market highperformance video card supp...,0,computers
2,finally said dream mediterranean new area grea...,2,politics
3,think scsi card dma transfers disks scsi card ...,0,computers
4,1 old jasmine drive cannot use new system unde...,0,computers


In [3]:
import joblib
label_mapping = joblib.load("C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/data/label_mapping.joblib")

In [4]:
len(df)

18846

In [5]:
df[df['text'].isnull()==True]

Unnamed: 0,text,label,category
91,,3,religion
104,,3,religion
127,,5,sports
135,,2,politics
161,,0,computers
...,...,...,...
18609,,2,politics
18643,,6,vehicles
18652,,0,computers
18735,,0,computers


In [6]:
18846-542

18304

In [7]:
df = df.dropna(subset=['text']).reset_index(drop=True)

In [8]:
len(df)

18304

In [9]:
X = df['text']
y = df['label']

In [10]:
X.isnull().sum()

np.int64(0)

## Data split and Vectorizing

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
vectorizer = TfidfVectorizer(
    max_features=10000,      # You can tune this
    ngram_range=(1, 2),     # Use unigrams and bigrams
    stop_words='english'    # Remove common stopwords
)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((14643,), (3661,), (14643,), (3661,))

In [14]:
acc_score = pd.DataFrame(columns=['Model', 'Accuracy'])

## Mulinomial Naive Bayes 

In [15]:
from sklearn.pipeline import Pipeline

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
clf_nb = MultinomialNB()
mnb_pipeline = Pipeline([
    ('tfidf',vectorizer),
    ('mnb',clf_nb)
]
)

In [18]:
param_grid_nb = {
    'mnb__alpha': [0.1, 0.5, 1.0]              # Smoothing parameter
}

In [19]:
grid_search_mnb = GridSearchCV(
    mnb_pipeline,
    param_grid_nb,
    cv=3,                   # 3-fold cross-validation
    n_jobs=-1,              # Use all CPU cores
    verbose=2,              # Prints progress
    scoring='accuracy'      # Optimize for accuracy
)

In [20]:
grid_search_mnb.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


0,1,2
,estimator,Pipeline(step...inomialNB())])
,param_grid,"{'mnb__alpha': [0.1, 0.5, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [21]:
print("✅ Best Parameters:", grid_search_mnb.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid_search_mnb.best_score_)

✅ Best Parameters: {'mnb__alpha': 0.1}
✅ Best Cross-Validation Accuracy: 0.8310455507751144


In [22]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [23]:
yhat_train_clf = grid_search_mnb.best_estimator_.predict(X_train)

print("Accuracy:", accuracy_score(y_train, yhat_train_clf))
print("\nClassification Report:\n", classification_report(y_train, yhat_train_clf))

Accuracy: 0.90193266407157

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.96      0.91      3785
           1       0.93      0.81      0.87       773
           2       0.88      0.91      0.89      2053
           3       0.91      0.89      0.90      1916
           4       0.89      0.85      0.87      3089
           5       0.97      0.93      0.95      1510
           6       0.94      0.90      0.92      1517

    accuracy                           0.90     14643
   macro avg       0.91      0.89      0.90     14643
weighted avg       0.90      0.90      0.90     14643



In [24]:
yhat_test_clf = grid_search_mnb.best_estimator_.predict(X_test)

print("Accuracy:", accuracy_score(y_test, yhat_test_clf))
print("\nClassification Report:\n", classification_report(y_test, yhat_test_clf))

Accuracy: 0.8443048347446053

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.92      0.88       984
           1       0.84      0.72      0.77       186
           2       0.83      0.82      0.83       503
           3       0.84      0.85      0.85       442
           4       0.78      0.78      0.78       743
           5       0.96      0.90      0.93       416
           6       0.89      0.78      0.83       387

    accuracy                           0.84      3661
   macro avg       0.85      0.83      0.84      3661
weighted avg       0.85      0.84      0.84      3661



## Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
lr_model = LogisticRegression(max_iter=1000, C=3, solver='lbfgs')
lr_pipeline = Pipeline([
    ('tfidf',vectorizer),
    ('lr',lr_model),
])

In [27]:
param_grid_lr = {
    'lr__C': [0.01, 0.1, 1, 10],              # Regularization strength
    'lr__penalty': ['l2'],                    # Use L2 regularization
    'lr__solver': ['lbfgs', 'liblinear'],     # Good solvers for text data
}


In [28]:
grid_search_lr = GridSearchCV(
    lr_pipeline,
    param_grid_lr,
    cv=3,                   # 3-fold cross-validation
    n_jobs=-1,              # Use all CPU cores
    verbose=2,              # Prints progress
    scoring='accuracy'      # Optimize for accuracy
)

In [29]:
grid_search_lr.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




0,1,2
,estimator,Pipeline(step..._iter=1000))])
,param_grid,"{'lr__C': [0.01, 0.1, ...], 'lr__penalty': ['l2'], 'lr__solver': ['lbfgs', 'liblinear']}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,1000


In [30]:
print("✅ Best Parameters:", grid_search_lr.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid_search_lr.best_score_)

✅ Best Parameters: {'lr__C': 10, 'lr__penalty': 'l2', 'lr__solver': 'liblinear'}
✅ Best Cross-Validation Accuracy: 0.8219627125589019


In [31]:
yhat_train_lr = grid_search_lr.best_estimator_.predict(X_train)

print("Accuracy:", accuracy_score(y_train, yhat_train_lr))
print("\nClassification Report:\n", classification_report(y_train, yhat_train_lr))

Accuracy: 0.9844977122174418

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98      3785
           1       0.98      0.96      0.97       773
           2       0.99      0.99      0.99      2053
           3       0.99      0.98      0.99      1916
           4       0.99      0.98      0.98      3089
           5       1.00      0.99      0.99      1510
           6       0.99      0.98      0.99      1517

    accuracy                           0.98     14643
   macro avg       0.99      0.98      0.98     14643
weighted avg       0.98      0.98      0.98     14643



In [32]:
yhat_test_lr = grid_search_lr.best_estimator_.predict(X_test)

print("Accuracy:", accuracy_score(y_test, yhat_test_lr))
print("\nClassification Report:\n", classification_report(y_test, yhat_test_lr))

Accuracy: 0.8426659382682328

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.89       984
           1       0.82      0.70      0.76       186
           2       0.82      0.81      0.81       503
           3       0.83      0.81      0.82       442
           4       0.78      0.83      0.80       743
           5       0.96      0.89      0.92       416
           6       0.85      0.76      0.80       387

    accuracy                           0.84      3661
   macro avg       0.85      0.82      0.83      3661
weighted avg       0.84      0.84      0.84      3661



In [33]:
acc_score['Logistic_Regression'] = accuracy_score(y_test, yhat_test_lr)

## Linear SVC

In [34]:
from sklearn.svm import LinearSVC

In [35]:
svm = LinearSVC(C=1.0, max_iter=2000)
svm_pipeline = Pipeline([
    ('tfidf',vectorizer),
    ('svm',svm)
])

In [36]:
param_grid_svm = {
    'svm__C': [0.1, 1, 5],
}

In [37]:
grid_search_svm = GridSearchCV(
    svm_pipeline,
    param_grid_svm,
    cv=3,                   # 3-fold cross-validation
    n_jobs=-1,              # Use all CPU cores
    verbose=2,              # Prints progress
    scoring='accuracy'      # Optimize for accuracy
)

In [38]:
grid_search_svm.fit(X_train, y_train)

Fitting 3 folds for each of 3 candidates, totalling 9 fits


0,1,2
,estimator,Pipeline(step..._iter=2000))])
,param_grid,"{'svm__C': [0.1, 1, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,3
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,0.1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [39]:
print("✅ Best Parameters:", grid_search_svm.best_params_)
print("✅ Best Cross-Validation Accuracy:", grid_search_svm.best_score_)

✅ Best Parameters: {'svm__C': 0.1}
✅ Best Cross-Validation Accuracy: 0.8197773680256778


In [40]:
yhat_train_svc = grid_search_svm.best_estimator_.predict(X_train)

print("✅ Accuracy:", accuracy_score(y_train, yhat_train_svc))
print("\n📊 Classification Report:\n", classification_report(y_train, yhat_train_svc))

✅ Accuracy: 0.919005668237383

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.97      0.93      3785
           1       0.94      0.78      0.85       773
           2       0.90      0.93      0.92      2053
           3       0.95      0.90      0.93      1916
           4       0.90      0.90      0.90      3089
           5       0.97      0.94      0.96      1510
           6       0.95      0.89      0.92      1517

    accuracy                           0.92     14643
   macro avg       0.93      0.90      0.91     14643
weighted avg       0.92      0.92      0.92     14643



In [41]:
yhat_test_svc = grid_search_svm.best_estimator_.predict(X_test)

print("✅ Accuracy:", accuracy_score(y_test, yhat_test_svc))
print("\n📊 Classification Report:\n", classification_report(y_test, yhat_test_svc))

✅ Accuracy: 0.8402075935536739

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.94      0.88       984
           1       0.87      0.66      0.75       186
           2       0.82      0.81      0.82       503
           3       0.87      0.80      0.83       442
           4       0.76      0.82      0.79       743
           5       0.96      0.89      0.92       416
           6       0.90      0.74      0.81       387

    accuracy                           0.84      3661
   macro avg       0.86      0.81      0.83      3661
weighted avg       0.84      0.84      0.84      3661



In [43]:
acc_score = pd.DataFrame(columns=['Model', 'Accuracy'])

acc_score.loc[len(acc_score)] = ['Naive_Bayes', accuracy_score(y_test, yhat_test_clf)]
acc_score.loc[len(acc_score)] = ['Logistic_Regression', accuracy_score(y_test, yhat_test_lr)]
acc_score.loc[len(acc_score)] = ['Linear_SVC', accuracy_score(y_test, yhat_test_svc)]

In [44]:
acc_score

Unnamed: 0,Model,Accuracy
0,Naive_Bayes,0.844305
1,Logistic_Regression,0.842666
2,Linear_SVC,0.840208


## Final Model

In [45]:
best_params = grid_search_mnb.best_params_
final_pipe = Pipeline([
    ('tfidf', vectorizer),
    ('clf', MultinomialNB(
       alpha=best_params['mnb__alpha']
    ))
])

In [46]:
final_pipe.fit(df['text'], df['label'])

0,1,2
,steps,"[('tfidf', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,alpha,0.1
,force_alpha,True
,fit_prior,True
,class_prior,


In [47]:
import joblib
joblib.dump(final_pipe, 'C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/models/final_mnb_pipeline.pkl')

['C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/models/final_mnb_pipeline.pkl']

In [52]:
# Later or in another script
model = joblib.load('C://Users/siddh/Progamming/Projects/Data_Science_Projects/Newsgroup_Topic_Modeling/models/final_mnb_pipeline.pkl')

# Predict on new text
new_text = ["Brad Pitt and Angelina Jolie got divorced"]
pred_label = model.predict(new_text)[0]
print("Predicted category:", label_mapping[pred_label])

Predicted category: sports
