In [7]:
# This notebook works with the a9a dataset.
# It applies Naive Bayes, a decision tree, and an SVM, and compares the accuracy of each.
# Libraries used: scikit-learn and NumPy.

In [8]:
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np 

In [10]:
# The a9a file (svmlight/LIBSVM format) is expected in the notebook's working directory.
X_sparse, y = load_svmlight_file('a9a.txt')

# Densify the sparse feature matrix into a plain 2-D ndarray.
X = X_sparse.toarray()


# Hold out 30% of the rows for testing (70-30 split, as the assignment asks);
# the fixed seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [11]:
# ========== Gaussian Naive Bayes ==========
# Standardise every feature, then fit Gaussian Naive Bayes with default settings
# (no hyper-parameter search for this model).
pipe_gnb = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('gnb', GaussianNB()),
]).fit(X_train, y_train)

# Accuracy on the held-out split.
# NOTE(review): the recorded run of this cell printed 0.4609, far below the
# decision-tree and SVM scores recorded in the later cells. Presumably the a9a
# features are binary indicators, which fit GaussianNB's assumptions poorly —
# confirm, and consider BernoulliNB or tuning var_smoothing.
gnb_predictions = pipe_gnb.predict(X_test)
gnb_accuracy = accuracy_score(y_test, gnb_predictions)
print(f"Gaussian Naive Bayes Accuracy: {gnb_accuracy:.4f}")



Gaussian Naive Bayes Accuracy: 0.4609


In [12]:
# ========== Decision Tree Classifier ==========
# Pipeline: standardise the features, then fit a decision tree.
# random_state is pinned because scikit-learn's tree builder permutes the
# features at every split; without a seed, equally good splits can be chosen
# differently on each run, so the selected parameters and the reported
# accuracy would not be reproducible.
dt_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('dt', DecisionTreeClassifier(random_state=42))
])

# Candidate hyper-parameters, addressed as <pipeline step>__<parameter>.
dt_param_grid = {
    'dt__criterion': ['gini', 'entropy'],
    'dt__max_depth': [10, 50, 100]
}

# 5-fold cross-validated search over the training split only.
# n_jobs=-1 runs the independent fits on all available cores; it does not
# change which configuration wins.
dt_grid_search = GridSearchCV(dt_pipe, dt_param_grid, cv=5, n_jobs=-1)
dt_grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# pipeline refit on the whole training split; score it on the held-out split.
best_dt_model = dt_grid_search.best_estimator_
dt_accuracy = best_dt_model.score(X_test, y_test)

print("Best Decision Tree Params:", dt_grid_search.best_params_)
print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")



Best Decision Tree Params: {'dt__criterion': 'entropy', 'dt__max_depth': 10}
Decision Tree Accuracy: 0.8351


In [13]:
# ========== Support Vector Machine ==========
# Pipeline: standardise the features, then fit a support vector classifier.
svm_pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC())
])

# One sub-grid per kernel so that each kernel is only combined with the
# parameters that apply to it (degree for poly, gamma for rbf).
svm_param_grid = [
    {'svc__kernel': ['linear']},
    {'svc__kernel': ['poly'], 'svc__degree': [2, 3]},
    {'svc__kernel': ['rbf'], 'svc__gamma': [0.001, 0.1, 2]}
]

# 6 candidates x 5 folds = 30 SVC fits. They are independent of one another,
# so n_jobs=-1 spreads them over all available cores; the selected model and
# its scores are the same as with a serial search.
svm_grid_search = GridSearchCV(svm_pipe, svm_param_grid, cv=5, n_jobs=-1)
svm_grid_search.fit(X_train, y_train)

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# pipeline refit on the whole training split; score it on the held-out split.
best_svm_model = svm_grid_search.best_estimator_
svm_accuracy = best_svm_model.score(X_test, y_test)

print("Best SVM Params:", svm_grid_search.best_params_)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# Pick the classifier with the highest test accuracy. best_model is a
# (name, accuracy) tuple; on a tie the earliest entry wins.
test_accuracies = {
    'GaussianNB': gnb_accuracy,
    'Decision Tree': dt_accuracy,
    'SVM': svm_accuracy,
}
best_model = max(test_accuracies.items(), key=lambda entry: entry[1])

print(f"\nBest classifier: {best_model[0]} with accuracy {best_model[1]:.4f}")


Best SVM Params: {'svc__kernel': 'linear'}
SVM Accuracy: 0.8484

Best classifier: SVM with accuracy 0.8484
