# Dataset: load_linnerud

## 1. Dataset analysis

In [2]:
from sklearn.datasets import load_linnerud
import pandas as pandas
import seaborn as seaborn
import matplotlib.pyplot as matplot
import numpy as np

In [14]:
linnerud = load_linnerud()
print("BRIEF DESCRIPTION OF THE DATASET: ")
print(linnerud.DESCR)
print()

print("ATTRIBUTES: ")
print(dir(linnerud))
print()

X = linnerud.data
y = linnerud.target
fn = linnerud.feature_names
tn = linnerud.target_names

print("DATA:")
show_data_X = pandas.DataFrame(data=X, columns=fn)
show_data_y = pandas.DataFrame(data=y, columns=tn)
frames = [show_data_X, show_data_y]
show_data = pandas.concat(frames, axis=1)
show_data

BRIEF DESCRIPTION OF THE DATASET: 
.. _linnerrud_dataset:

Linnerrud dataset
-----------------

**Data Set Characteristics:**

    :Number of Instances: 20
    :Number of Attributes: 3
    :Missing Attribute Values: None

The Linnerud dataset is a multi-output regression dataset. It consists of three
exercise (data) and three physiological (target) variables collected from
twenty middle-aged men in a fitness club:

- *physiological* - CSV containing 20 observations on 3 physiological variables:
   Weight, Waist and Pulse.
- *exercise* - CSV containing 20 observations on 3 exercise variables:
   Chins, Situps and Jumps.

|details-start|
**References**
|details-split|

* Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris:
  Editions Technic.

|details-end|

ATTRIBUTES: 
['DESCR', 'data', 'data_filename', 'data_module', 'feature_names', 'frame', 'target', 'target_filename', 'target_names']

DATA:


Unnamed: 0,Chins,Situps,Jumps,Weight,Waist,Pulse
0,5.0,162.0,60.0,191.0,36.0,50.0
1,2.0,110.0,60.0,189.0,37.0,52.0
2,12.0,101.0,101.0,193.0,38.0,58.0
3,12.0,105.0,37.0,162.0,35.0,62.0
4,13.0,155.0,58.0,189.0,35.0,46.0
5,4.0,101.0,42.0,182.0,36.0,56.0
6,8.0,101.0,38.0,211.0,38.0,56.0
7,6.0,125.0,40.0,167.0,34.0,60.0
8,15.0,200.0,40.0,176.0,31.0,74.0
9,17.0,251.0,250.0,154.0,33.0,56.0


**Further data analysis**

In [15]:
show_data.describe()

Unnamed: 0,Chins,Situps,Jumps,Weight,Waist,Pulse
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,9.45,145.55,70.3,178.6,35.4,56.1
std,5.286278,62.566575,51.27747,24.690505,3.201973,7.210373
min,1.0,50.0,25.0,138.0,31.0,46.0
25%,4.75,101.0,39.5,160.75,33.0,51.5
50%,11.5,122.5,54.0,176.0,35.0,55.0
75%,13.25,210.0,85.25,191.5,37.0,60.5
max,17.0,251.0,250.0,247.0,46.0,74.0


# 2. Model fitting

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=23)

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

**PERCEPTRON**

In [18]:
def perceptron(X, y, margin=0.1, learning_rate=1.0, max_iters=200):
    N, D = X.shape
    Y = np.unique(y)
    C = Y.size
    weights = np.zeros((1+D, C))
    
    for iteration in range(1, max_iters + 1):
        errors = 0
        
        for n in range(N):
            xn = np.array([1, *X[n, :]])
            cn = np.squeeze(np.where(Y==y[n]))
            gn = weights[:,cn].T @ xn
            err = False
            
            for c in np.arange(C):
                if c != cn and weights[:,c].T @ xn + margin >= gn:
                    weights[:, c] = weights[:, c] - learning_rate*xn; err = True
            if err:
                weights[:, cn] = weights[:, cn] + learning_rate*xn
                errors = errors + 1
                
        if errors == 0:
            break

    return weights

In [19]:
%%timeit -n1 -r1

best_precision = 0
best_params = [0,0,0]

for margin in (0.01, 0.1, 0.3, 0.5, 0.7):
    for learning_rate in (0.1, 0.5, 1, 2):
        for max_iters in (100, 200, 500, 1000):
            W = perceptron(X_train, y_train, margin, learning_rate, max_iters)

            X_testh = np.hstack([np.ones((len(X_test), 1)), X_test])
            y_test_pred  = np.argmax(X_testh @ W, axis=1).reshape(-1, 1)
            err_test = np.count_nonzero(y_test_pred != y_test) / len(X_test)
            precision = 100 - err_test
            
            if precision > best_precision:
                best_precision = precision
                best_params = [margin, learning_rate, max_iters]

print(f'Accuracy: {best_precision:.1f}% with margin {best_params[0]}, learning rate {best_params[1]} and {best_params[2]} maximum iterations')

ValueError: operands could not be broadcast together with shapes (32,) (3,) 

**NAIVE BAYES** 

In [20]:
%%timeit -n1 -r1
nb = GaussianNB()

Gnb = {"var_smoothing": [1e-9, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSnb = GridSearchCV(nb, Gnb, scoring='accuracy', refit=True, cv=5)

acc = GSnb.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSnb.best_params_}')

ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\naive_bayes.py", line 262, in fit
    y = self._validate_data(y=y)
        ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 607, in _validate_data
    out = _check_y(y, **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 1183, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 1244, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (12, 3) instead.

--------------------------------------------------------------------------------
32 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\naive_bayes.py", line 262, in fit
    y = self._validate_data(y=y)
        ^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 607, in _validate_data
    out = _check_y(y, **check_params)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 1183, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\validation.py", line 1244, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (13, 3) instead.


**LINEAR DISCRIMINANT ANALYSIS**

In [10]:
%%timeit -n1 -r1
lda = LinearDiscriminantAnalysis()

Glda = {"solver": ['svd', 'lsqr', 'eigen'], "n_components": [1,2], "tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSlda = GridSearchCV(lda, Glda, scoring='accuracy', refit=True, cv=5)

acc = GSlda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlda.best_params_}')

Accuracy: 100.0% con {'n_components': 1, 'solver': 'svd', 'tol': 1e-05}
1.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**QUADRATIC DISCRIMINANT ANALYSIS**

In [11]:
%%timeit -n1 -r1
import warnings; warnings.filterwarnings('ignore')

qda = QuadraticDiscriminantAnalysis()

Gqda = {"tol": [1e-5, 1e-4, 0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 0.9]}
GSqda = GridSearchCV(qda, Gqda, scoring='accuracy', refit=True, cv=5)

acc = GSqda.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSqda.best_params_}')

Accuracy: 97.2% con {'tol': 1e-05}
108 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**LOGISTIC REGRESSION**

In [12]:
%%timeit -n1 -r1

log_reg = LogisticRegression(random_state=23)

Glogreg = {"penalty": ['l1', 'l2', None], "tol": [0.001, 0.01, 0.1], "solver": ['lbfgs', 'liblinear', 'newton-cg'], "multi_class": ['auto', 'multinomial'], "max_iter": [10, 50, 100], "n_jobs": [1,2,4], "C": [0.001, 0.1, 1, 10, 100]}
GSlogreg = GridSearchCV(log_reg, Glogreg, scoring='accuracy', refit=True, cv=5)

acc = GSlogreg.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSlogreg.best_params_}')

Accuracy: 100.0% con {'C': 100, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': 1, 'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.001}
4min 36s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**DECISION TREE CLASSIFIER**

In [13]:
%%timeit -n1 -r1

dtc = DecisionTreeClassifier(random_state=23)

Gdtc = {"criterion": ['gini', 'entropy', 'log_loss'], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSdtc = GridSearchCV(dtc, Gdtc, scoring='accuracy', refit=True, cv=5)

acc = GSdtc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSdtc.best_params_}')

Accuracy: 97.2% con {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2}
1.47 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**RANDOM FOREST**

In [14]:
%%timeit -n1 -r1

rfc = RandomForestClassifier(random_state=23)

Grfc = {"n_estimators": [1,5,10,25,50], "criterion": ['gini', 'entropy', 'log_loss'], "bootstrap": [True,False], "max_depth": [1,3,5,7,10,20], "min_samples_split": [2,3,4,5]}
GSrfc = GridSearchCV(rfc, Grfc, scoring='accuracy', refit=True, cv=5)

acc = GSrfc.fit(X_train, y_train).score(X_test, y_test)
print(f'Accuracy: {acc:.1%} con {GSrfc.best_params_}')

Accuracy: 100.0% con {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 50}
1min 23s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**ADABOOST CLASSIFIER**

In [15]:
%%timeit -n1 -r1

abc = AdaBoostClassifier(random_state=23)

Gabc = {"n_estimators": [2,5,10,20,50,100], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9,0.99,1]}
GSabc = GridSearchCV(abc, Gabc, scoring='accuracy', refit=True, cv=5)

acc = (GSabc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSabc.best_params_}')

Accuracy: 100.0% con {'learning_rate': 0.7, 'n_estimators': 50}
15.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**GRADIENT BOOSTING CLASSIFIER**

In [16]:
%%timeit -n1 -r1

gbc = GradientBoostingClassifier(random_state=23)

Ggbc = {"n_estimators": [2,5,10,20,50], "learning_rate": [0.01,0.1,0.3,0.5,0.7,0.9], "criterion": ['friedman_mse', 'squared_error'], "max_depth": [1,3,5,7,10], "min_samples_split": [2,3,4]}
GSgbc = GridSearchCV(gbc, Ggbc, scoring='accuracy', refit=True, cv=5)

acc = (GSgbc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSgbc.best_params_}')

Accuracy: 100.0% con {'criterion': 'friedman_mse', 'learning_rate': 0.5, 'max_depth': 1, 'min_samples_split': 2, 'n_estimators': 20}
6min 27s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**KNEIGHBORS CLASSIFIER**

In [17]:
%%timeit -n1 -r1

knc = KNeighborsClassifier()

Gknc = {'n_neighbors': [1,2,5,10,20], 'weights':['uniform', 'distance'], 'leaf_size': [5,10,20,30,50], 'p': [1,2,3,4,5]}
GSknc = GridSearchCV(knc, Gknc, scoring='accuracy', refit=True, cv=5)

acc = (GSknc.fit(X_train, y_train).score(X_test, y_test))
print(f'Accuracy: {acc:.1%} con {GSknc.best_params_}')

Accuracy: 80.6% con {'leaf_size': 5, 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}
4.89 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
