#Task:

- Load the Abalone dataset using pd.read_csv from 'https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'

- Use column_names as : ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

- How many samples are there?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
column_names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']
abalone_data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data', header=None, names=column_names)


In [None]:
abalone_data.shape

(4177, 9)

#Task

- How many numerical and how many categorical features are there?

- Divide the data into X and y ('Rings' is the target).

- Divide X and y into train and test data, using test_size = 0.2 (random_state = 0)

In [None]:
abalone_data.head(10)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [None]:
X = abalone_data.iloc[:, :-1]
y = abalone_data.iloc[:, -1]


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

#Task

- Create two separate lists, one for numerical feature names, another for categorical features names.

- Create a pipeline for numerical features that replaces missing values (i.e., 0 values) with 0.107996 and then scales using Standard scaler.

- Create an object of OneHotEncoder (to handle categorical features)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
numeric_features = ['Length', 'Diameter',	'Height',	'Whole weight',	'Shucked weight',	'Viscera weight',	'Shell weight']
categorical_features = ["Sex"]


In [None]:
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(missing_values = 0, strategy="constant", fill_value = 0.107996)), ("scaler", StandardScaler())]
)


In [None]:
categorical_transformer = OneHotEncoder(handle_unknown="ignore")



#Task

- Create a ColumnTransformer to handle numerical and categorical features.



In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

#Task

- Create a pipeline to first preprocess using the columntransformer and then BaggingClassifier (with default values)

In [None]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", BaggingClassifier(random_state = 0))]
)

In [None]:
?BaggingClassifier()

#Task

- Train the model on training data and print the score on test data.

In [None]:
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.238


#Task

- Now use cross_val_score on the training data with cv = 10, and print the accuracy on each fold. Also print the mean score.




In [None]:
from sklearn.model_selection import cross_val_score
acc = cross_val_score(estimator = clf, X = X_train, y = y_train, cv = 10)
print(type(acc))
print('Accuracy of each fold ', list(acc*100))
print("Accuracy: {:.2f} %".format(acc.mean()*100))



<class 'numpy.ndarray'>
Accuracy of each fold  [21.791044776119403, 23.652694610778443, 22.15568862275449, 21.856287425149702, 21.856287425149702, 24.251497005988025, 24.850299401197603, 25.149700598802394, 22.15568862275449, 23.652694610778443]
Accuracy: 23.14 %


#Task

- Let's use GridSearchCV on BaggingClassifier with

tuned_parameters = {'n_estimators': [10, 50, 100, 500],
                    'max_samples': [0.05, 0.1, 0.2, 0.5]
                     }

- Use scoring criteria to be recall.

- Before using GridSearchCV, preprocess the training data using the columnTransformer.

- Print the best parameters.

In [None]:
X_train_new = preprocessor.fit_transform(X_train)


In [None]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{'n_estimators': [10, 50, 100, 500],
                    'max_samples': [0.05, 0.1, 0.2, 0.5]
                     }]
scores = ['recall']
for score in scores:

    print()
    print("Tuning hyperparameters for {score}")
    print()

    clf_CV = GridSearchCV(
        BaggingClassifier(random_state = 0), tuned_parameters
    )
    clf_CV.fit(X_train_new, y_train)

    print("Best parameters:")
    print()
    print(clf_CV.best_params_)
    print()



Tuning hyperparameters for {score}





Best parameters:

{'max_samples': 0.05, 'n_estimators': 500}



#Task

- Use the best parameters in the pipeline and print test score.

In [None]:
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", BaggingClassifier(max_samples =0.05, n_estimators = 100, random_state = 0))]
)

In [None]:
clf2.fit(X_train, y_train)
print("model score: %.3f" % clf2.score(X_test, y_test))

model score: 0.264




---



#Task

- In the pipeline, now use RandomForestClassifier.

- Print test score.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_RFC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(random_state = 0))]
)

clf_RFC.fit(X_train, y_train)
print("model score: %.3f" % clf_RFC.score(X_test, y_test))

model score: 0.256


#Task

- Let's use GridSearchCV on RandomForestClassifier with

tuned_parameters = {'n_estimators': [50, 250, 500],
                    'max_depth' : [6,8,10]
                     }

- Before using GridSearchCV, preprocess the training data using the columnTransformer.

- Print the best parameters.

In [None]:
X_train_new = preprocessor.fit_transform(X_train)

In [None]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{'n_estimators': [50, 250, 500],
                    'max_depth' : [6,8,10]
                     }]
scores = ['recall']
for score in scores:

    print()
    print("Tuning hyperparameters for {score}")
    print()

    clf_RFC_CV = GridSearchCV(
        RandomForestClassifier(random_state = 0), tuned_parameters
    )
    clf_RFC_CV.fit(X_train_new, y_train)

    print("Best parameters:")
    print()
    print(clf_RFC_CV.best_params_)
    print()



Tuning hyperparameters for {score}





Best parameters:

{'max_depth': 8, 'n_estimators': 250}



#Task

- Use best parameters in Pipeline now and print the score.

In [None]:
clf_RFC2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(max_depth= 8, n_estimators = 250, random_state = 0))]
)

clf_RFC2.fit(X_train, y_train)
print(clf_RFC2.score(X_test, y_test))

0.2799043062200957


#Task

- In the pipeline, now use GradientBoostingClassifier.

- Print test score.



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_GBC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier(random_state = 0))]
)

clf_GBC.fit(X_train, y_train)
print(clf_GBC.score(X_test, y_test))

0.2332535885167464


#Task

- Let's use GridSearchCV on GradientBoostingClassifier with

tuned_parameters = {"learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth":[3,5,8],}

- Before using GridSearchCV, preprocess the training data using the columnTransformer.

- Print the best parameters.

In [None]:
X_train_new = preprocessor.fit_transform(X_train)

In [None]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "max_depth":[3,5,8],
}]


scores = ['recall']
for score in scores:

    print()
    print("Tuning hyperparameters for {score}")
    print()

    clf_GBC_CV = GridSearchCV(
        GradientBoostingClassifier(random_state = 0), tuned_parameters
    )
    clf_GBC_CV.fit(X_train_new, y_train)

    print("Best parameters:")
    print()
    print(clf_GBC_CV.best_params_)
    print()
    print("Grid scores:")



Tuning hyperparameters for {score}





Best parameters:

{'learning_rate': 0.01, 'max_depth': 3}

Grid scores:


#Task

- Use best parameters in Pipeline now and print the score.

In [None]:
clf_GBC2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier(learning_rate= 0.075, max_depth= 3, max_features='log2', n_estimators=10))]
)

clf_GBC2.fit(X_train, y_train)
print("model score: %.3f" % clf_GBC2.score(X_test, y_test))

model score: 0.258


#Task

- Use AdaBoostClassifier in the pipeline now. and print the test score.



---



In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_ABC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", AdaBoostClassifier(random_state = 0))]
)

clf_ABC.fit(X_train, y_train)
print(clf_ABC.score(X_test, y_test))

0.1937799043062201



#Task

- Let's use GridSearchCV on AdaBoostClassifier with

tuned_parameters = {'n_estimators': [50,150],    
    'learning_rate': [0.1,0.7]}

- Before using GridSearchCV, preprocess the training data using the columnTransformer.

- Print the best parameters.

In [None]:
X_train_new = preprocessor.fit_transform(X_train)

In [None]:
from sklearn.model_selection import GridSearchCV
tuned_parameters = [{
    'n_estimators': [50,150],
    'learning_rate': [0.1,0.7]
}]


scores = ['recall']
for score in scores:

    print()
    print("Tuning hyperparameters for {score}")
    print()

    clf_ABC_CV = GridSearchCV(
        AdaBoostClassifier(random_state = 0), tuned_parameters
    )
    clf_ABC_CV.fit(X_train_new, y_train)

    print("Best parameters:")
    print()
    print(clf_ABC_CV.best_params_)
    print()


In [None]:
clf_ABC2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", AdaBoostClassifier(learning_rate= 0.7, n_estimators= 50, random_state = 0))]
)

clf_ABC2.fit(X_train, y_train)
print(clf_ABC2.score(X_test, y_test))

0.23923444976076555


#VotingClassifier

In [None]:
models = list()
models.append(('knn1', KNeighborsClassifier(n_neighbors=1)))
models.append(('knn3', KNeighborsClassifier(n_neighbors=3)))
models.append(('knn5', KNeighborsClassifier(n_neighbors=5)))
models.append(('knn7', KNeighborsClassifier(n_neighbors=7)))
models.append(('knn9', KNeighborsClassifier(n_neighbors=9)))


clf_VC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", VotingClassifier(estimators=models,voting='hard'))]
)

clf_VC.fit(X_train, y_train)
print("model score: %.3f" % clf_VC.score(X_test, y_test))


NameError: ignored

In [None]:
models = list()
models.append(('svm1', SVC(probability=True, kernel='poly', degree=1)))
models.append(('svm2', SVC(probability=True, kernel='poly', degree=2)))
models.append(('svm3', SVC(probability=True, kernel='poly', degree=3)))
models.append(('svm4', SVC(probability=True, kernel='poly', degree=4)))
models.append(('svm5', SVC(probability=True, kernel='poly', degree=5)))


clf_VC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", VotingClassifier(estimators=models,voting='hard'))]
)

clf_VC.fit(X_train, y_train)
print("model score: %.3f" % clf_VC.score(X_test, y_test))


In [None]:
models = list()
models.append(('cart1', DecisionTreeClassifier(max_depth=1)))
models.append(('cart2', DecisionTreeClassifier(max_depth=2)))
models.append(('cart3', DecisionTreeClassifier(max_depth=3)))
models.append(('cart4', DecisionTreeClassifier(max_depth=4)))
models.append(('cart5', DecisionTreeClassifier(max_depth=5)))

clf_VC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", VotingClassifier(estimators=models,voting='hard'))]
)

clf_VC.fit(X_train, y_train)
print("model score: %.3f" % clf_VC.score(X_test, y_test))


In [None]:
models = list()
models.append(('lr1', LogisticRegression(penalty = 'l1', solver='liblinear')))
models.append(('lr2', LogisticRegression(penalty = 'l2', solver='liblinear')))
models.append(('lr3', LogisticRegression(penalty = 'elasticnet', solver='saga', l1_ratio=0.5)))
models.append(('lr4', LogisticRegression(penalty = 'none', solver='saga')))

clf_VC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", VotingClassifier(estimators=models,voting='hard'))]
)

clf_VC.fit(X_train, y_train)
print("model score: %.3f" % clf_VC.score(X_test, y_test))



In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

lr = LogisticRegression()
dt = DecisionTreeClassifier()
svm= SVC(probability=True)
knn= KNeighborsClassifier()

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf_VC = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", VotingClassifier(estimators=[('lr', lr), ('dt', dt), ('svc', svm), ('knn',knn)],voting='hard'))]
)

clf_VC.fit(X_train, y_train)
print("model score: %.3f" % clf_VC.score(X_test, y_test))




---



---

