## **Exercise 16.04: Guided Exercise**
###  Modeling and predictions using ML pipelines

### Importing modules

In [None]:
import pandas as pd


### Loading data

In [None]:
# Remote location of the Pima Indians diabetes CSV
filename = 'https://raw.githubusercontent.com/OsamaAkhlaq/DS_Book/main/Chapter%2016/pima-indians-diabetes.csv'

# Read the CSV with pandas: comma-separated, no header row, "?" parsed as NaN
diabData = pd.read_csv(
    filename,
    sep=",",
    header=None,
    na_values="?",
)
diabData.head()

### Finding number of null values in the data set

In [None]:

# Count the missing entries in each column
diabData.isna().sum()

### Dropping all the rows with na values

In [None]:
# Keep only the rows that have no missing values
newdiabdata = diabData.dropna(axis="index")

### Printing the shape of earlier data set and new data set

In [None]:
# Shapes before and after dropping rows, one per line
print(diabData.shape, newdiabdata.shape, sep="\n")

### Separating X and y variables

In [None]:
# Features are the columns labelled 0 through 7.
# `.loc` slices by label and includes BOTH endpoints, so the upper bound must
# stop at 7: slicing 0:8 would also pull column 8 (the target selected below)
# into the feature matrix and leak the label into the model inputs.
X = newdiabdata.loc[:, 0:7]
print(X.shape)
# Column 8 is the target variable
y = newdiabdata.loc[:, 8]
print(y.shape)

### Splitting the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Shapes of the training and test feature sets, one per line
print(X_train.shape, X_test.shape, sep="\n")

## Pipeline for dummy creation


In [None]:

# Importing the necessary packages
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numerical variables: a single standard-scaling step
numTransformer = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
    ]
)

# Categorical variables: a single one-hot encoding step
# (handle_unknown='ignore' is passed so unknown categories are ignored)
catTransformer = Pipeline(
    steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

### Printing dtypes for X

In [None]:

# Display the dtype of every column in X
X.dtypes

### Selecting numerical features

In [None]:

# Labels of the columns of X whose dtype is int64 or float64
numFeatures = X.select_dtypes(
    include=['int64', 'float64'],
).columns

numFeatures

### Selecting Categorical features

In [None]:
# Labels of the columns of X whose dtype is object
catFeatures = X.select_dtypes(
    include=['object'],
).columns

catFeatures

### Creating the preprocessing engine

In [None]:

from sklearn.compose import ColumnTransformer

# Apply the numeric pipeline to the numeric columns and the
# categorical pipeline to the categorical columns
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numTransformer, numFeatures),
    ('categoric', catTransformer, catFeatures),
])

### Transforming the Training data

In [None]:
# Fit the preprocessor on the training features, transform them,
# and wrap the result in a DataFrame
Xtran_train = pd.DataFrame(
    data=preprocessor.fit_transform(X_train),
)
print(Xtran_train.shape)
Xtran_train.head()

### Transforming Test data

In [None]:
# Transform the test features with the already-fitted preprocessor
# (transform only, no refit) and wrap the result in a DataFrame
Xtran_test = pd.DataFrame(
    data=preprocessor.transform(X_test),
)
print(Xtran_test.shape)
Xtran_test.head()

## Dimensionality reduction after processing with Pipeline

In [None]:
# Importing PCA library
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

### Creating an estimator with both preprocessor and dimensionality reduction

In [None]:
# Full modelling pipeline: preprocessing, then PCA down to 5 components,
# then a logistic-regression classifier with a fixed seed
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA(n_components=5)),
        ('clf', LogisticRegression(random_state=123)),
    ]
)

In [None]:
# Fit every step of the pipeline on the training split
estimator.fit(X=X_train, y=y_train)

### Creating the score on the test set

In [None]:

# Score the fitted pipeline on the held-out test split
estimator.score(X=X_test, y=y_test)

### Generating the predictions on test set

In [None]:

# Predicted labels for the test split
pred = estimator.predict(X=X_test)

### Printing the classification report

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps each class's precision and recall in the report, so the true
# labels must come first.
print(classification_report(y_test, pred))

### Generating confusion matrix

In [None]:

from sklearn.metrics import confusion_matrix

# Confusion matrix of true test labels against the pipeline's predictions
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)