In [118]:
import urllib

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import HTML, display
import warnings
from warnings import filterwarnings
from scipy import stats

In [119]:
from sklearn import feature_selection as fs
from sklearn import metrics, preprocessing, tree
from sklearn.datasets import make_blobs
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import (
    RepeatedKFold,
    RepeatedStratifiedKFold,
    GridSearchCV,
    KFold,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import PowerTransformer
from sklearn.tree import DecisionTreeClassifier

## Loading dataset & columns

In [120]:
# The raw file is read with header=None, so the column names are supplied
# explicitly, in file order.
FILE = "./abalone.data"
abalone = pd.read_csv(
    FILE,
    names=[
        "Sex", "Length", "Diameter", "Height",
        "Whole weight", "Shucked weight", "Viscera weight", "Shell weight",
        "Rings",
    ],
    header=None,
)
abalone.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [121]:
# Inspect the dtype pandas inferred for each column.
abalone.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
Rings               int64
dtype: object

In [122]:
# Work on a copy so the frame loaded from disk is left unmodified.
df_abalone = abalone.copy()

In [123]:
# Number of missing values per column.
df_abalone.isna().sum()

Sex               0
Length            0
Diameter          0
Height            0
Whole weight      0
Shucked weight    0
Viscera weight    0
Shell weight      0
Rings             0
dtype: int64

In [124]:
# Caption followed by summary statistics of the int64/float64 columns.
display(
    HTML("<b>Table 1: Summary of continuous features</b>"),
    df_abalone.describe(include=["int64", "float64"]),
)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


## Encoding

In [125]:
# Encoding: separate the target (Rings) and one-hot encode the remaining
# descriptive features.
target = df_abalone["Rings"]
df_abalone_all = pd.get_dummies(df_abalone.drop(columns="Rings"))

# Un-encoded feature/target pair taken from the originally loaded frame.
X = abalone.drop(columns="Rings")
y = abalone["Rings"]

df_abalone_all.columns

Index(['Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
       'Viscera weight', 'Shell weight', 'Sex_F', 'Sex_I', 'Sex_M'],
      dtype='object')

In [126]:
# (rows, columns) of the encoded feature frame.
df_abalone_all.shape

(4177, 10)

In [127]:
# Class distribution of the target: number of samples per ring count.
target.value_counts()

9     689
10    634
8     568
11    487
7     391
12    267
6     259
13    203
14    126
5     115
15    103
16     67
17     58
4      57
18     42
19     32
20     26
3      15
21     14
23      9
22      6
27      2
24      2
1       1
26      1
29      1
2       1
25      1
Name: Rings, dtype: int64

## Scaling

In [128]:
# Keep the raw encoded matrix and build a min-max scaled version of it.
abl_unscaled = df_abalone_all.to_numpy()
abl_minmax = preprocessing.MinMaxScaler().fit_transform(abl_unscaled)

## Feature Selection

### Performance

In [129]:
# Baseline: accuracy of a 1-nearest-neighbour classifier on the full scaled
# feature set, using 5-fold stratified CV repeated 3 times.
scoring_metric = "accuracy"
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)
clf = KNeighborsClassifier(n_neighbors=1)
cv_results_full = cross_val_score(
    clf, abl_minmax, target, scoring=scoring_metric, cv=cv_method
)
cv_results_full



array([0.19258373, 0.20813397, 0.2       , 0.21077844, 0.21916168,
       0.19856459, 0.20454545, 0.21317365, 0.19041916, 0.2011976 ,
       0.21172249, 0.20813397, 0.19161677, 0.18323353, 0.19401198])

In [130]:
# Mean accuracy over all CV folds for the full feature set.
cv_results_full.mean().round(4)

0.2018

### F-Score

In [131]:
# NOTE(review): this repeats the full-feature-set mean shown in the previous
# cell; nothing F-score specific is computed here.
cv_results_full.mean().round(4)

0.2018

### Feature Selection Using F-Score

In [132]:
# Number of top-ranked features to keep when ranking; the encoded frame has
# 10 columns, so this ranks every feature.
num_features = 10

In [133]:
# Score every feature with the ANOVA F-statistic and rank them best-first.
# Only the fitted scores_ are needed, so fit() is used; the matrix returned by
# fit_transform() was never used.
fs_fit_fscore = fs.SelectKBest(fs.f_classif, k=num_features)
fs_fit_fscore.fit(abl_minmax, target)
fs_indices_fscore = np.argsort(fs_fit_fscore.scores_)[::-1][0:num_features]
fs_indices_fscore

array([1, 0, 6, 2, 3, 5, 4, 8, 7, 9])

In [134]:
# Dtypes of the encoded feature columns.
df_abalone_all.dtypes

Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
Sex_F               uint8
Sex_I               uint8
Sex_M               uint8
dtype: object

In [135]:
# Translate the ranked column indices into feature names (best first).
best_features_fscore = df_abalone_all.columns[fs_indices_fscore].values
best_features_fscore

array(['Diameter', 'Length', 'Shell weight', 'Height', 'Whole weight',
       'Viscera weight', 'Shucked weight', 'Sex_I', 'Sex_F', 'Sex_M'],
      dtype=object)

In [136]:
# F-scores in the same best-first order as best_features_fscore.
feature_importances_fscore = fs_fit_fscore.scores_[fs_indices_fscore]
feature_importances_fscore

array([196.43610528, 188.50942458, 147.23694566, 124.42771441,
       113.71233973, 103.72143783,  80.79767682,  68.87047717,
        15.78351983,  10.62934022])

In [137]:
def plot_imp(best_features, scores, method_name, color):
    """Build a bar chart of feature importances.

    Parameters
    ----------
    best_features : array-like
        Feature names, in the order the bars should appear.
    scores : array-like
        Importance value for each entry of ``best_features``.
    method_name : str
        Name of the scoring method; it is prefixed to the chart title.
    color : str
        Fill colour of the bars.

    Returns
    -------
    alt.Chart
        The configured bar chart.
    """
    df = pd.DataFrame({"features": best_features, "importances": scores})
    chart = (
        alt.Chart(df, width=700, title=method_name + " Feature Importances")
        .mark_bar(opacity=0.75, color=color)
        .encode(
            alt.X(
                "features",
                title="Feature",
                # sort=None keeps the order of the input data.
                sort=None,
                # alt.Axis is the per-channel axis class; AxisConfig is meant
                # for chart-level configuration.
                axis=alt.Axis(labelAngle=45),
            ),
            alt.Y("importances", title="Importance"),
        )
    )
    return chart

In [138]:
# Bar chart of the F-score ranking, in best-first order.
plot_imp(best_features_fscore, feature_importances_fscore, "F-Score", "purple")

  for col_name, dtype in df.dtypes.iteritems():


In [139]:
# Cross-validate `clf` with the current `cv_method`, restricted to the
# 8 highest-ranked F-score features.
cv_results_fscore = cross_val_score(
    estimator=clf,
    X=abl_minmax[:, fs_indices_fscore[:8]],
    y=target,
    cv=cv_method,
    scoring=scoring_metric,
)
cv_results_fscore.mean().round(4)



0.2025

### Using Mutual Information

In [140]:
# Rank features by estimated mutual information with the target.
# mutual_info_classif is a randomized estimator, so its seed is fixed to keep
# the ranking reproducible across runs. Only scores_ is needed, hence fit()
# rather than fit_transform().
fs_fit_mutual_info = fs.SelectKBest(
    lambda X, y: fs.mutual_info_classif(X, y, random_state=999),
    k=num_features,
)
fs_fit_mutual_info.fit(abl_minmax, target)
fs_indices_mutual_info = np.argsort(fs_fit_mutual_info.scores_)[::-1][0:num_features]
best_features_mutual_info = df_abalone_all.columns[fs_indices_mutual_info].values
best_features_mutual_info

array(['Shell weight', 'Diameter', 'Length', 'Whole weight', 'Height',
       'Viscera weight', 'Shucked weight', 'Sex_I', 'Sex_F', 'Sex_M'],
      dtype=object)

In [141]:
# Mutual-information scores in the same best-first order as
# best_features_mutual_info.
feature_importances_mutual_info = fs_fit_mutual_info.scores_[fs_indices_mutual_info]
feature_importances_mutual_info

array([0.45214068, 0.40089486, 0.39122894, 0.39058293, 0.38334077,
       0.37867806, 0.33201782, 0.16617673, 0.03005267, 0.02794876])

In [142]:
# Bar chart of the mutual-information ranking, in best-first order.
plot_imp(
    best_features_mutual_info,
    feature_importances_mutual_info,
    "Mutual Information",
    "green",
)

  for col_name, dtype in df.dtypes.iteritems():


In [143]:
# Cross-validate `clf` with the current `cv_method`, restricted to the
# 8 highest-ranked mutual-information features.
cv_results_mutual_info = cross_val_score(
    estimator=clf,
    X=abl_minmax[:, fs_indices_mutual_info[:8]],
    y=target,
    cv=cv_method,
    scoring=scoring_metric,
)
cv_results_mutual_info.mean().round(4)



0.2025

### RFI (Random Forest Importance)

In [144]:
# Rank features by random-forest importance. The seed is fixed so the ranking
# (and every later cell that slices fs_indices_rfi) is reproducible across
# kernel restarts.
model_rfi = RandomForestClassifier(n_estimators=100, random_state=999)
model_rfi.fit(abl_minmax, target)
fs_indices_rfi = np.argsort(model_rfi.feature_importances_)[::-1][0:num_features]

In [145]:
# Translate the ranked column indices into feature names (best first).
best_features_rfi = df_abalone_all.columns[fs_indices_rfi].values
best_features_rfi

array(['Shell weight', 'Shucked weight', 'Viscera weight', 'Whole weight',
       'Diameter', 'Length', 'Height', 'Sex_M', 'Sex_F', 'Sex_I'],
      dtype=object)

In [146]:
# Random-forest importances in the same best-first order as best_features_rfi.
feature_importances_rfi = model_rfi.feature_importances_[fs_indices_rfi]
feature_importances_rfi

array([0.16404578, 0.15581694, 0.14949935, 0.14743962, 0.11598637,
       0.11312139, 0.10291125, 0.0184276 , 0.01742475, 0.01532695])

In [147]:
# Bar chart of the random-forest importance ranking, in best-first order.
plot_imp(best_features_rfi, feature_importances_rfi, "Random Forest", "red")

  for col_name, dtype in df.dtypes.iteritems():


In [148]:
# Cross-validate `clf` with the current `cv_method`, restricted to the
# 5 highest-ranked random-forest features.
# NOTE(review): the F-score and mutual-information runs keep 8 features, so
# the three results are not compared at the same subset size.
cv_results_rfi = cross_val_score(
    estimator=clf,
    X=abl_minmax[:, fs_indices_rfi[:5]],
    y=target,
    cv=cv_method,
    scoring=scoring_metric,
)
cv_results_rfi.mean().round(4)



0.2005

### Feature selection comparison

##### Let's collect the best results of each feature selection method

* FS = 0.2025 (top 8 features)
* MI = 0.2025 (top 8 features)
* RFI = 0.2005 (top 5 features)

##### Let's compare them

In [149]:
# Paired t-tests between the per-fold accuracies of the three feature subsets.
# The mutual-information vs. F-score test yields NaN when both methods pick the
# same set of top-8 features: the per-fold scores are then identical, every
# paired difference is zero, and the t statistic is undefined.
print(stats.ttest_rel(cv_results_rfi, cv_results_fscore))
print(stats.ttest_rel(cv_results_rfi, cv_results_mutual_info))
print(stats.ttest_rel(cv_results_mutual_info, cv_results_fscore))

Ttest_relResult(statistic=-0.5503904610814505, pvalue=0.5907259816708715)
Ttest_relResult(statistic=-0.5503904610814505, pvalue=0.5907259816708715)
Ttest_relResult(statistic=nan, pvalue=nan)


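##### None of the feature selection methods is significantly better than the others (all p-values are above 0.05; the MI vs. F-Score test is NaN because both methods select the same top 8 features and therefore produce identical scores).
Since RFI reaches a comparable cross-validation accuracy with fewer features, we will use the top 5 features based on RFI.
Let's do a paired t-test for the cross-validation results of the RFI feature combination and the full set.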

In [150]:
# Paired t-test: top-5 RFI feature subset vs. the full feature set.
print(stats.ttest_rel(cv_results_rfi, cv_results_full))

Ttest_relResult(statistic=-0.3356906227909424, pvalue=0.7420818391368322)


In [151]:
# Names of the 5 features ranked highest by random-forest importance.
df_abalone_all.columns[fs_indices_rfi[:5]]

Index(['Shell weight', 'Shucked weight', 'Viscera weight', 'Whole weight',
       'Diameter'],
      dtype='object')

In [152]:
# Wrap the scaled matrix in a DataFrame so columns can be selected by name.
abl_minmax_df = pd.DataFrame(data=abl_minmax, columns=df_abalone_all.columns)

In [153]:
# Keep only the top-5 RFI columns of the scaled data.
abl_rfi = abl_minmax_df[df_abalone_all.columns[fs_indices_rfi[:5]].values]

In [154]:
# Convert to a NumPy array. np.asarray accepts both a DataFrame and an ndarray,
# so re-running this cell does not fail the way `.values` would on an ndarray.
abl_rfi = np.asarray(abl_rfi)

### Hyperparameter tuning

In [155]:
# 10-fold stratified CV repeated 3 times for hyperparameter tuning; this
# rebinds the `cv_method` name used by the earlier cells.
cv_method = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=4)

In [156]:
# KNN search grid: number of neighbours and the Minkowski power parameter p
# (1 = Manhattan, 2 = Euclidean).
params_KNN = {"n_neighbors": [2, 4, 8, 16, 32, 64], "p": [1, 2]}

In [157]:
# Exhaustive grid search over params_KNN, scored by accuracy; training-fold
# scores are recorded as well (return_train_score=True).
gs_KNN = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=params_KNN,
    cv=cv_method,
    verbose=1,  # verbose: the higher, the more messages
    scoring="accuracy",
    return_train_score=True,
)

In [158]:
# Run the KNN grid search on the top-5 RFI features.
gs_KNN.fit(abl_rfi, target)

Fitting 30 folds for each of 12 candidates, totalling 360 fits




In [159]:
# Parameter combination with the highest mean CV accuracy.
gs_KNN.best_params_

{'n_neighbors': 64, 'p': 1}

In [160]:
# Mean CV accuracy achieved by the best KNN parameter combination.
gs_KNN.best_score_

0.2718060957932219

In [161]:
# One row per evaluated parameter combination.
results_KNN = pd.DataFrame(gs_KNN.cv_results_["params"])

In [162]:
# Attach the mean test-fold accuracy of each combination.
results_KNN["test_score"] = gs_KNN.cv_results_["mean_test_score"]

In [163]:
# Add a readable distance-metric label derived from the power parameter p.
results_KNN["metric"] = results_KNN["p"].replace(
    {1: "Manhattan", 2: "Euclidean", 3: "Minkowski"}
)
results_KNN

Unnamed: 0,n_neighbors,p,test_score,metric
0,2,1,0.21762,Manhattan
1,2,2,0.219139,Euclidean
2,4,1,0.216266,Manhattan
3,4,2,0.216986,Euclidean
4,8,1,0.239331,Manhattan
5,8,2,0.233585,Euclidean
6,16,1,0.253535,Manhattan
7,16,2,0.249698,Euclidean
8,32,1,0.264225,Manhattan
9,32,2,0.265745,Euclidean


In [164]:
# Mean CV accuracy against the number of neighbours, one line per metric.
# The y-axis is not forced to start at zero (zero=False).
alt.Chart(results_KNN, title="KNN Performance Comparison").mark_line(point=True).encode(
    alt.X("n_neighbors", title="Number of Neighbors"),
    alt.Y("test_score", title="Mean CV Score", scale=alt.Scale(zero=False)),
    color="metric",
)

  for col_name, dtype in df.dtypes.iteritems():


### Decision Tree

In [165]:
# Grid-search a decision tree over split criterion, maximum depth and minimum
# split size, scoring accuracy with 5-fold stratified CV repeated 3 times.
cv_method = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=999)
params_DT = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(1, 9)),
    'min_samples_split': [2, 3],
}
df_classifier = DecisionTreeClassifier(random_state=4)
gs_DT = GridSearchCV(
    df_classifier,
    params_DT,
    scoring='accuracy',
    cv=cv_method,
    verbose=1,
)
gs_DT.fit(abl_rfi, target);




Fitting 15 folds for each of 32 candidates, totalling 480 fits


In [166]:
# Parameter combination with the highest mean CV accuracy.
gs_DT.best_params_

{'criterion': 'entropy', 'max_depth': 4, 'min_samples_split': 2}

In [167]:
# Mean CV accuracy achieved by the best decision-tree parameter combination.
gs_DT.best_score_

0.268451422513824

In [168]:
# One row per evaluated parameter combination plus its mean test-fold accuracy.
results_DT = pd.DataFrame(gs_DT.cv_results_['params'])
results_DT['test_score'] = gs_DT.cv_results_['mean_test_score']
results_DT.columns

Index(['criterion', 'max_depth', 'min_samples_split', 'test_score'], dtype='object')

In [169]:
# Mean CV accuracy against maximum depth, one line per split criterion.
# aggregate='average' averages the scores of rows that share the same
# max_depth and criterion (i.e. over min_samples_split).
alt.Chart(
    results_DT,
    title='DT Performance Comparison').mark_line(point=True).encode(
        alt.X('max_depth', title='Maximum Depth'),
        alt.Y('test_score', title='Mean CV Score', 
        aggregate='average', 
        scale=alt.Scale(zero=False)),
        color='criterion'
        )       

  for col_name, dtype in df.dtypes.iteritems():


In [170]:
# Random forest searched over the same grid values as the decision tree.
# NOTE(review): `df_classifier` is rebound here; it previously held the
# decision tree.
df_classifier = RandomForestClassifier(random_state=4)
params_RF = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3]
    }

In [171]:
# 5-fold stratified CV repeated 3 times, with a fixed seed.
cv_method = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=999
    )

In [172]:
# Exhaustive grid search over params_RF, scored by accuracy.
gs_RF = GridSearchCV(
    estimator=df_classifier,
    param_grid=params_RF,
    cv=cv_method,
    verbose=1,
    scoring='accuracy'
    )

In [173]:
# Run the random-forest grid search on the top-5 RFI features.
gs_RF.fit(abl_rfi, target)

Fitting 15 folds for each of 32 candidates, totalling 480 fits




In [174]:
# Parameter combination with the highest mean CV accuracy.
gs_RF.best_params_

{'criterion': 'entropy', 'max_depth': 7, 'min_samples_split': 2}

In [175]:
# Mean CV accuracy achieved by the best random-forest parameter combination.
gs_RF.best_score_

0.2745184272603119

In [176]:
# Mean test-fold accuracy of every evaluated parameter combination.
gs_RF.cv_results_['mean_test_score']

array([0.21833653, 0.21833653, 0.25943424, 0.25943424, 0.27180577,
       0.27180577, 0.27260388, 0.27268372, 0.27355987, 0.27411856,
       0.27204271, 0.2705267 , 0.26877288, 0.27036816, 0.26430431,
       0.26438348, 0.21434767, 0.21434767, 0.25688088, 0.25688088,
       0.27076832, 0.27076832, 0.27196497, 0.27196497, 0.27308207,
       0.27324175, 0.2723636 , 0.27172383, 0.27451843, 0.2688512 ,
       0.26669818, 0.26470141])

In [177]:
# One row per evaluated parameter combination plus its mean test-fold accuracy.
results_RF = pd.DataFrame(gs_RF.cv_results_['params'])
results_RF['test_score'] = gs_RF.cv_results_['mean_test_score']
results_RF.columns

Index(['criterion', 'max_depth', 'min_samples_split', 'test_score'], dtype='object')

In [178]:
# Mean CV accuracy of the random forest against maximum depth, one line per
# split criterion (averaged over min_samples_split). This cell previously
# re-plotted results_DT under a decision-tree title.
alt.Chart(
    results_RF,
    title='RF Performance Comparison'
).mark_line(point=True).encode(
    alt.X('max_depth', title='Maximum Depth'),
    alt.Y(
        'test_score',
        title='Mean CV Score',
        aggregate='average',
        scale=alt.Scale(zero=False),
    ),
    color='criterion',
)

  for col_name, dtype in df.dtypes.iteritems():


### GaussianNB

In [179]:
# Preview of a log-spaced grid: 10 points from 1e0 down to 1e-9.
np.logspace(0,-9, num=10)

array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09])

In [180]:
# Power-transform the unscaled encoded data, then keep the top-5 RFI columns
# as a NumPy array.
abl_power = PowerTransformer().fit_transform(abl_unscaled)
abl_power_df = pd.DataFrame(abl_power, columns=df_abalone_all.columns)
abl_power_rfi = abl_power_df[
    df_abalone_all.columns[fs_indices_rfi[:5]].values
].values

In [181]:
# Grid-search Gaussian naive Bayes over 100 log-spaced var_smoothing values
# (1e0 down to 1e-9), scoring accuracy with 5-fold stratified CV repeated
# 3 times. The search is fitted on the power-transformed top-5 RFI features.
nb_classifier = GaussianNB()
params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
cv_method = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=999)
gs_NB = GridSearchCV(
    estimator=nb_classifier,
    param_grid=params_NB,
    cv=cv_method,
    verbose=1,
    scoring='accuracy')
# NOTE(review): Data_transformed is not used in this cell (the fit below uses
# abl_power_rfi) — confirm whether it is needed later, otherwise remove it.
Data_transformed = PowerTransformer().fit_transform(abl_rfi)
gs_NB.fit(abl_power_rfi, target);

Fitting 15 folds for each of 100 candidates, totalling 1500 fits




In [182]:
# Parameter value with the highest mean CV accuracy.
gs_NB.best_params_

{'var_smoothing': 1.0}

In [183]:
# Mean CV accuracy achieved by the best var_smoothing value.
gs_NB.best_score_

0.2617519983955534

In [184]:
# One row per evaluated var_smoothing value plus its mean test-fold accuracy.
results_NB = pd.DataFrame(gs_NB.cv_results_['params'])
results_NB['test_score'] = gs_NB.cv_results_['mean_test_score']

In [185]:
# Mean CV accuracy against var_smoothing, as an interactive chart.
# NOTE(review): the grid is log-spaced but no log scale is set on the x-axis —
# consider one so the small values are distinguishable.
alt.Chart(results_NB,
title='NB Performance Comparison').mark_line(point=True).encode(
    alt.X('var_smoothing', title='Var. Smoothing'),
    alt.Y('test_score', title='Mean CV Score', scale=alt.Scale(zero=False))).interactive()

  for col_name, dtype in df.dtypes.iteritems():


### MLP

In [186]:
# Grid-search a multi-layer perceptron over hidden-layer size, L2 penalty
# (alpha), solver and learning-rate schedule, scoring accuracy with 5-fold
# stratified CV repeated 3 times.
# MLPClassifier is already imported in the import cell at the top of the
# notebook, so it is not re-imported here. The classifier is seeded, like the
# decision tree and random forest above, so the search is reproducible.
df_classifier = MLPClassifier(max_iter=100, random_state=4)
params_MLP = {
    'hidden_layer_sizes': [5, 10, 20],
    'alpha': [0.0001, 0.001],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'adaptive'],
}
cv_method = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=999)
gs_MLP = GridSearchCV(
    estimator=df_classifier,
    param_grid=params_MLP,
    cv=cv_method,
    verbose=1,
    scoring='accuracy')


In [187]:
# Run the MLP grid search on the top-5 RFI features.
gs_MLP.fit(abl_rfi, target)

Fitting 15 folds for each of 24 candidates, totalling 360 fits




In [188]:
# Parameter combination with the highest mean CV accuracy.
gs_MLP.best_params_

{'alpha': 0.001,
 'hidden_layer_sizes': 20,
 'learning_rate': 'constant',
 'solver': 'adam'}

In [189]:
# Mean CV accuracy achieved by the best MLP parameter combination.
gs_MLP.best_score_

0.26765703043673417

In [190]:
# One row per evaluated parameter combination plus its mean test-fold accuracy.
results_MLP = pd.DataFrame(gs_MLP.cv_results_['params'])
results_MLP['test_score'] = gs_MLP.cv_results_['mean_test_score']

In [191]:
# Display the MLP tuning results table.
results_MLP

Unnamed: 0,alpha,hidden_layer_sizes,learning_rate,solver,test_score
0,0.0001,5,constant,sgd,0.168221
1,0.0001,5,constant,adam,0.244753
2,0.0001,5,adaptive,sgd,0.171651
3,0.0001,5,adaptive,adam,0.236619
4,0.0001,10,constant,sgd,0.178994
5,0.0001,10,constant,adam,0.257441
6,0.0001,10,adaptive,sgd,0.181945
7,0.0001,10,adaptive,adam,0.256882
8,0.0001,20,constant,sgd,0.174127
9,0.0001,20,constant,adam,0.264224
