## **Exercise 16.06: Guided Exercise**
### Grid search and Cross-validation with ML Pipelines

### Importing modules

In [None]:
import pandas as pd

### Loading data

In [None]:

# Location of the Pima Indians diabetes CSV.
filename = 'https://raw.githubusercontent.com/OsamaAkhlaq/DS_Book/main/Chapter%2016/pima-indians-diabetes.csv'

# Read the file without a header row (columns get integer labels) and
# parse every "?" entry as a missing value.
diabData = pd.read_csv(
    filename,
    sep=",",
    header=None,
    na_values="?",
)
# Preview the first rows of the loaded frame.
diabData.head()

### Finding number of null values in the data set

In [None]:

# Count the missing entries in each column.
diabData.isna().sum()

### Dropping all the rows with na values

In [None]:

# Keep only the rows without any missing value; diabData itself is not modified.
newdiabdata = diabData.dropna(axis="index", how="any")

### Printing the shape of earlier data set and new data set

In [None]:

# Shape before dropping incomplete rows ...
print(diabData.shape)
# ... and after; the difference in row count is the number of rows removed.
print(newdiabdata.shape)

### Separating X and y variables

In [None]:

# Column 8 is the target. `.loc` slices by label and includes BOTH endpoints,
# so `0:8` would also copy the target into the feature matrix (label leakage).
# Drop the target column explicitly instead, and take features and target from
# the NA-free frame so the cleaning step above is actually used.
X = newdiabdata.drop(columns=[8])
print(X.shape)
y = newdiabdata.loc[:, 8]
print(y.shape)

### Splitting the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Report the shapes of the training and test feature matrices, one per line.
print(X_train.shape, X_test.shape, sep="\n")

## Pipeline for dummy variable creation


In [None]:

# Importing the necessary packages
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Categorical columns: one-hot encode; handle_unknown='ignore' keeps transform
# from raising on categories that were not seen during fit.
catTransformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Numerical columns: standardise with StandardScaler.
numTransformer = Pipeline([('scaler', StandardScaler())])

### Printing dtypes for X

In [None]:

# Show the dtype of every feature column in X.
X.dtypes

### Selecting numerical features

In [None]:

# Columns stored as int64 or float64 are treated as numerical features.
numericDtypes = ['int64', 'float64']
numFeatures = X.select_dtypes(include=numericDtypes).columns
numFeatures

### Selecting Categorical features

In [None]:

# Columns stored as object dtype are treated as categorical features.
categoricalDtypes = ['object']
catFeatures = X.select_dtypes(include=categoricalDtypes).columns
catFeatures

### Creating the preprocessing engine

In [None]:
from sklearn.compose import ColumnTransformer
# Route each group of columns to its own transformer:
# numerical columns are scaled, categorical columns are one-hot encoded.
columnRoutes = [
    ('numeric', numTransformer, numFeatures),
    ('categoric', catTransformer, catFeatures),
]
preprocessor = ColumnTransformer(transformers=columnRoutes)

### Importing necessary libraries

In [None]:

from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier

### Creating a pipeline with AdaBoostClassifier

In [None]:

# Full modelling chain: preprocessing -> PCA -> AdaBoost classifier (seeded).
pipelineSteps = [
    ('preprocessor', preprocessor),
    ('dimred', PCA()),
    ('classifier', AdaBoostClassifier(random_state=123)),
]
pipe = Pipeline(steps=pipelineSteps)

### Defining the parameters as a dictionary

In [None]:

# PCA cannot keep more components than the number of columns it receives, so
# drop any candidate larger than the number of selected feature columns.
# (Each numerical column passes through 1:1; this assumes every categorical
# column expands to at least one one-hot column -- confirm if categoricals are added.)
maxComponents = len(numFeatures) + len(catFeatures)

# Keys follow the "<pipeline step>__<parameter>" convention used by Pipeline.
param_grid = {
    'dimred__n_components': [k for k in (5, 7, 9) if k <= maxComponents],
    "classifier__n_estimators": [50, 100, 200],
    "classifier__learning_rate": [0.7, 0.6, 1.0],
}


In [None]:
from sklearn.model_selection import GridSearchCV
# Set up an exhaustive search over param_grid with 10-fold cross-validation.
# Nothing is trained here; this only constructs the search object.
estimator = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10)

### Fitting the estimator on the training set

In [None]:
# Run the cross-validated grid search on the training split.
estimator.fit(X=X_train, y=y_train)

### Printing the best score and best parameters

In [None]:

# Best mean cross-validation score and the parameter combination that produced it.
bestScore = estimator.best_score_
bestParams = estimator.best_params_
print("Best: %f using %s" % (bestScore, bestParams))


### Predicting with the best estimator

In [None]:
# Predict labels for the held-out test features with the fitted grid-search estimator.
pred = estimator.predict(X_test)

### Printing the classification report

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). Passing the predictions first
# swaps precision with recall and reports support for the predicted classes
# instead of the true ones.
print(classification_report(y_test, pred))