### Statistical Learning for Data Science 2 (229352)
#### Instructor: Donlapark Ponnoprat

#### [Course website](https://donlapark.pages.dev/229352/)

## Lab #4

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

from scipy.stats import uniform

In [2]:
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset='test')

Xtrain = train.data[:3000]
ytrain = train.target[:3000]
Xtest = test.data[:500]
ytest = test.target[:500]

print("X:", len(Xtest))
print("y:", len(ytest))

X: 500
y: 500


### Naive Bayes [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

In [4]:
pipeline = Pipeline([('tfid',TfidfVectorizer(stop_words = 'english')),
                   ('nb', MultinomialNB())])

pipeline.fit(Xtrain, ytrain)
ypred = pipeline.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.67      0.38      0.48        21
           1       0.79      0.52      0.63        21
           2       0.58      0.69      0.63        26
           3       0.74      0.68      0.71        34
           4       0.72      0.85      0.78        34
           5       0.88      0.81      0.84        26
           6       1.00      0.73      0.84        22
           7       0.70      1.00      0.82        28
           8       0.90      0.82      0.86        33
           9       0.88      0.84      0.86        25
          10       0.82      1.00      0.90        27
          11       0.79      0.95      0.86        20
          12       0.59      0.54      0.57        24
          13       0.75      0.78      0.77        23
          14       0.87      0.71      0.78        28
          15       0.53      0.90      0.67        29
          16       0.50      0.95      0.66        21
          17       0.94    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Search Cross-Validation [(Documentation)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

### Uniform distribution in `Scipy` [(Documentation)](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html)

In [7]:
parameters = {'nb__alpha': uniform(loc = 0, scale = 10)}

clf = RandomizedSearchCV(pipeline,parameters,n_iter=7,verbose=100)
clf.fit(Xtrain,ytrain)


Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV 1/5; 1/7] START nb__alpha=3.243344540416917.................................
[CV 1/5; 1/7] END ..nb__alpha=3.243344540416917;, score=0.755 total time=   0.9s
[CV 2/5; 1/7] START nb__alpha=3.243344540416917.................................
[CV 2/5; 1/7] END ..nb__alpha=3.243344540416917;, score=0.762 total time=   0.9s
[CV 3/5; 1/7] START nb__alpha=3.243344540416917.................................
[CV 3/5; 1/7] END ..nb__alpha=3.243344540416917;, score=0.745 total time=   0.8s
[CV 4/5; 1/7] START nb__alpha=3.243344540416917.................................
[CV 4/5; 1/7] END ..nb__alpha=3.243344540416917;, score=0.728 total time=   0.9s
[CV 5/5; 1/7] START nb__alpha=3.243344540416917.................................
[CV 5/5; 1/7] END ..nb__alpha=3.243344540416917;, score=0.767 total time=   0.9s
[CV 1/5; 2/7] START nb__alpha=0.6974695647233842................................
[CV 1/5; 2/7] END .nb__alpha=0.6974695647233842;,

In [8]:
ypred = clf.predict(Xtest)
print(classification_report(ytest, ypred))

              precision    recall  f1-score   support

           0       0.71      0.48      0.57        21
           1       0.68      0.62      0.65        21
           2       0.63      0.65      0.64        26
           3       0.71      0.71      0.71        34
           4       0.83      0.88      0.86        34
           5       0.89      0.65      0.76        26
           6       1.00      0.73      0.84        22
           7       0.74      1.00      0.85        28
           8       0.93      0.85      0.89        33
           9       0.91      0.84      0.88        25
          10       0.82      1.00      0.90        27
          11       0.80      1.00      0.89        20
          12       0.64      0.67      0.65        24
          13       0.83      0.83      0.83        23
          14       0.85      0.82      0.84        28
          15       0.59      0.93      0.72        29
          16       0.51      0.95      0.67        21
          17       0.94    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Exercise

1. For the Naive Bayes model, use grid search 5-fold cross-validation across different values of `alpha` to find the best model.

2. For the best value of `alpha`, compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* What is the model's `f1_macro` score?

3. Repeat Exercise 1 and 2 for **random search** 5-fold cross validation across different values of `alpha`. Compute the `f1_macro` score on the test set.
* What value of `alpha` did you obtain?
* Did you get a better `f1_macro` score compared to grid search in Exercise 2?

1.
Best alpha (Grid Search): 0.001

Test f1_macro: 0.7442025675112527

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nb", MultinomialNB())
])



In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

param_grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "nb__alpha": [1e-3, 1e-2, 1e-1, 1, 10]
}

grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1
)

grid.fit(Xtrain, ytrain)


In [16]:
best_alpha_grid = grid.best_params_["nb__alpha"]
print("Best alpha (Grid Search):", best_alpha_grid)


Best alpha (Grid Search): 0.001


In [17]:
y_pred_grid = grid.best_estimator_.predict(Xtest)
f1_grid = f1_score(ytest, y_pred_grid, average="macro")

print("Test f1_macro (Grid Search):", f1_grid)


Test f1_macro (Grid Search): 0.7442025675112527


2.
Best alpha (Random Search): 0.004037017258596553

Test f1_macro: 0.7599429519124723

In [18]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

param_dist = {
    "tfidf__ngram_range": [(1,1), (1,2)],
    "nb__alpha": np.logspace(-3, 1, 100)
}

random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=20,
    scoring="f1_macro",
    cv=5,
    random_state=42,
    n_jobs=-1
)

random_search.fit(Xtrain, ytrain)


In [19]:
best_alpha_random = random_search.best_params_["nb__alpha"]
print("Best alpha (Random Search):", best_alpha_random)


Best alpha (Random Search): 0.004037017258596553


In [20]:
y_pred_random = random_search.best_estimator_.predict(Xtest)
f1_random = f1_score(ytest, y_pred_random, average="macro")

print("Test f1_macro (Random Search):", f1_random)


Test f1_macro (Random Search): 0.7599429519124723


3.
การใช้ Random Search ได้ค่า alpha = 0.005336699231206312 และได้ค่า f1_macro เท่ากับ 0.7570828728744793.
เมื่อเปรียบเทียบกับ Grid Search ซึ่งได้ค่า f1_macro เท่ากับ 0.7442025675112527 พบว่า Random Search (ให้ผลดีกว่า) Grid Search

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
import numpy as np

param_dist = {
    "nb__alpha": np.logspace(-3, 1, 100)
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring="f1_macro",
    random_state=42,
    n_jobs=-1
)

random_search.fit(Xtrain, ytrain)

best_alpha_random = random_search.best_params_["nb__alpha"]

y_pred_random = random_search.best_estimator_.predict(Xtest)
f1_random = f1_score(ytest, y_pred_random, average="macro")

print("Best alpha (Random Search):", best_alpha_random)
print("Test f1_macro:", f1_random)


Best alpha (Random Search): 0.005336699231206312
Test f1_macro: 0.7570828728744793
