<a href="https://colab.research.google.com/github/PrachitiSParulekar/ML/blob/main/Day_7_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import kagglehub

# Kaggle handle ("<owner>/<dataset>") of the corpus used throughout this notebook.
dataset_handle = "sunilthite/text-document-classification-dataset"

# Fetch the latest published version; `path` is the local location of the downloaded files.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/sunilthite/text-document-classification-dataset?dataset_version_number=1...


100%|██████████| 1.85M/1.85M [00:00<00:00, 76.4MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/sunilthite/text-document-classification-dataset/versions/1





In [3]:
import os

# Inspect the download directory so the name of the data file is known before loading it.
dataset_files = os.listdir(path)
print(dataset_files)

['df_file.csv']


In [4]:
import os
import pandas as pd

# Build the full path to the CSV inside the download directory, load it,
# and preview the first rows as the cell's output.
csv_path = os.path.join(path, "df_file.csv")
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Text,Label
0,Budget to set scene for election\n \n Gordon B...,0
1,Army chiefs in regiments decision\n \n Militar...,0
2,Howard denies split over ID cards\n \n Michael...,0
3,Observers to monitor UK election\n \n Minister...,0
4,Kilroy names election seat target\n \n Ex-chat...,0


In [5]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(2225, 2)

In [6]:
# Per-column non-null counts and dtypes; a quick check for missing values before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    2225 non-null   object
 1   Label   2225 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 34.9+ KB


In [7]:
# Text preprocessing: hold out a test split, then turn the documents into word-count features.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Raw documents and their class labels.
X, y = data['Text'], data['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training documents only (English stop words removed),
# then encode the test documents with that same vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [13]:
# Shape of the vectorized training split: (training documents, features).
X_train_vec.shape

(1780, 26485)

In [14]:
# Shape of the vectorized test split; the feature count should match the training matrix.
X_test_vec.shape

(445, 26485)

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Baseline classifier with default settings, trained on the vectorized training split.
# `fit` returns the estimator itself, so the chained form binds the fitted model.
model = MultinomialNB().fit(X_train_vec, y_train)
model

In [16]:
from sklearn.metrics import accuracy_score, classification_report

# Score the baseline model on the held-out split: overall accuracy first,
# then per-class precision / recall / F1.
y_pred = model.predict(X_test_vec)
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_report = classification_report(y_test, y_pred)

print("Accuracy:", baseline_accuracy)
print(baseline_report)

Accuracy: 0.9730337078651685
              precision    recall  f1-score   support

           0       0.96      0.97      0.96        92
           1       1.00      0.99      0.99        98
           2       0.96      0.97      0.97        77
           3       0.96      0.99      0.97        76
           4       0.98      0.95      0.97       102

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97       445
weighted avg       0.97      0.97      0.97       445



In [17]:
# Tune the Naive Bayes smoothing strength with a cross-validated grid search.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Candidate values for the smoothing parameter `alpha`.
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]}
nb = MultinomialNB()

# 5-fold cross-validation over the grid, ranked by accuracy.
# NOTE(review): X_train_vec was vectorized once on the full training split before this
# search, so each validation fold shares its vocabulary with the training folds.
grid_search = GridSearchCV(
    estimator=nb,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_vec, y_train)

print("Best params:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Held-out evaluation; with the default refit, predict() uses the best configuration
# retrained on the whole training split.
y_pred = grid_search.predict(X_test_vec)
tuned_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", tuned_accuracy)
print(classification_report(y_test, y_pred))

Best params: {'alpha': 0.5}
Best cross-validation score: 0.9752808988764045
Test Accuracy: 0.9730337078651685
              precision    recall  f1-score   support

           0       0.96      0.97      0.96        92
           1       1.00      0.99      0.99        98
           2       0.96      0.97      0.97        77
           3       0.96      0.99      0.97        76
           4       0.98      0.95      0.97       102

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97       445
weighted avg       0.97      0.97      0.97       445



In [18]:
# Advanced features: TF-IDF weighted uni-/bi-grams followed by chi-squared feature selection.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# The TF-IDF objects get their own names instead of rebinding `vectorizer`,
# `X_train_vec` and `X_test_vec`. Reusing those names made the earlier cells
# (shape checks, baseline model, grid search) silently operate on a different
# feature space whenever they were re-run after this cell.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Optional: keep the 1000 features selected by the chi-squared test against the
# training labels; the selector is fitted on the training split only.
selector = SelectKBest(chi2, k=1000)
X_train_sel = selector.fit_transform(X_train_tfidf, y_train)
X_test_sel = selector.transform(X_test_tfidf)

# Train again using the alpha found by the grid search.
# NOTE(review): that alpha was tuned on the count-based features, not on these
# TF-IDF + chi2 features; it is reused here without re-tuning.
nb = MultinomialNB(alpha=grid_search.best_params_['alpha'])
nb.fit(X_train_sel, y_train)
y_pred = nb.predict(X_test_sel)
print("Test Accuracy with advanced features:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Test Accuracy with advanced features: 0.9685393258426966
              precision    recall  f1-score   support

           0       0.98      0.97      0.97        92
           1       0.99      0.99      0.99        98
           2       0.96      0.94      0.95        77
           3       0.94      1.00      0.97        76
           4       0.97      0.95      0.96       102

    accuracy                           0.97       445
   macro avg       0.97      0.97      0.97       445
weighted avg       0.97      0.97      0.97       445

