## **Activity 16.01: Guided Exercise**
### Perform all the steps learned in Chapter 16 and apply them to the Chronic Kidney Disease dataset. Extensive data cleaning is done first; the modelling steps start after that:

### Importing modules

In [139]:
import pandas as pd

### Loading data

In [None]:

# Location of the chronic kidney disease CSV in the book's GitHub repository
filename = (
    'https://raw.githubusercontent.com/OsamaAkhlaq/DS_Book/main/'
    'Chapter%2016/kidney_disease.csv'
)

# Read the CSV into a DataFrame and preview its first rows
Data = pd.read_csv(filename)
Data.head()

### Dropping id column

In [114]:

# Remove the 'id' column from Data in place
Data.drop(columns='id', inplace=True)

In [115]:
# Replace the column labels with descriptive names (25 labels, assigned by position).
# NOTE(review): 'peda_edema' and 'aanemia' look like misspellings; 'aanemia' is used
# as a slice label when X is built later, so any correction must be applied there too.
Data.columns = ['age', 'blood_pressure', 'specific_gravity', 'albumin', 'sugar', 'red_blood_cells', 'pus_cell',
              'pus_cell_clumps', 'bacteria', 'blood_glucose_random', 'blood_urea', 'serum_creatinine', 'sodium',
              'potassium', 'haemoglobin', 'packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count',
              'hypertension', 'diabetes_mellitus', 'coronary_artery_disease', 'appetite', 'peda_edema',
              'aanemia', 'class']

#### Convert necessary columns to numerical type 

In [116]:
# Coerce these three columns to a numeric dtype; entries that cannot be parsed become NaN
for _numeric_col in ('packed_cell_volume', 'white_blood_cell_count', 'red_blood_cell_count'):
    Data[_numeric_col] = pd.to_numeric(Data[_numeric_col], errors='coerce')

### Replace incorrectly entered values

In [117]:
# Normalise category labels that carry stray tab/space characters.
# Every result is assigned back to its column: calling replace(..., inplace=True) on
# Data['diabetes_mellitus'] acts on an intermediate Series, which pandas warns about
# and which leaves Data unchanged when Copy-on-Write is enabled.
Data['diabetes_mellitus'] = Data['diabetes_mellitus'].replace(
    to_replace={'\tno': 'no', '\tyes': 'yes', ' yes': 'yes'})

Data['coronary_artery_disease'] = Data['coronary_artery_disease'].replace(to_replace='\tno', value='no')

# 'ckd\t' is folded into 'ckd'; 'notckd' is relabelled 'not ckd', the spelling used
# by the class-encoding step that follows.
Data['class'] = Data['class'].replace(to_replace={'ckd\t': 'ckd', 'notckd': 'not ckd'})

### Give class either 0 or 1

In [118]:
# Encode the target: 'ckd' -> 0, 'not ckd' -> 1; labels outside the mapping become NaN
Data['class'] = pd.to_numeric(
    Data['class'].map({'ckd': 0, 'not ckd': 1}),
    errors='coerce',
)

### Finding number of null values in the data set

In [None]:
# Count the missing values in every column
Data.isna().sum()

In [None]:
# Columns whose labels were cleaned above, plus the encoded target
cols = ['diabetes_mellitus', 'coronary_artery_disease', 'class']

# Print the distinct values of each one to verify the clean-up
for col in cols:
    print(f"{col} has {Data[col].unique()} values\n")

### Extracting categorical and numerical columns

In [121]:

# Split the column names by dtype: object columns are treated as categorical,
# every other column as numerical
cat_cols = [name for name, dtype in Data.dtypes.items() if dtype == 'object']
num_cols = [name for name, dtype in Data.dtypes.items() if dtype != 'object']

### Look at unique values in categorical columns

In [None]:

# List the distinct values found in every categorical column
for col in cat_cols:
    print(f"{col} has {Data[col].unique()} values\n")

### Fillna

In [123]:
def random_value_imputation(feature, random_state=None):
    """Fill the missing entries of ``Data[feature]`` with randomly drawn observed values.

    The replacement values are sampled from the non-missing entries of the same
    column, and the global ``Data`` frame is modified in place.

    Parameters
    ----------
    feature : str
        Label of the column in ``Data`` to impute.
    random_state : int, numpy Generator/RandomState or None, optional
        Forwarded to ``Series.sample``; pass a fixed value to make the
        imputation reproducible. ``None`` (the default) keeps the draw unseeded.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to sample from.
    """
    missing_mask = Data[feature].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to fill
        return

    observed = Data[feature].dropna()
    if observed.empty:
        raise ValueError(
            f"Cannot impute '{feature}': the column has no observed values to sample from."
        )

    # Sample without replacement whenever possible; fall back to sampling with
    # replacement only when there are more gaps than observed values, a case in
    # which sampling without replacement raises ValueError.
    random_sample = observed.sample(
        n=n_missing,
        replace=n_missing > len(observed),
        random_state=random_state,
    )

    # Assign by position so the sampled rows' original index labels play no role
    Data.loc[missing_mask, feature] = random_sample.to_numpy()
    
def impute_mode(feature):
    """Fill the missing entries of ``Data[feature]`` with the column's mode.

    The first entry of ``Series.mode()`` is used as the fill value, and the
    column of the global ``Data`` frame is replaced by the filled version.
    """
    column = Data[feature]
    most_frequent = column.mode()[0]
    Data[feature] = column.fillna(most_frequent)

In [None]:
# Randomly impute every numerical column
for col in num_cols:
    random_value_imputation(col)

# Show the remaining missing-value count per numerical column
Data[num_cols].isna().sum()

### Fill "red_blood_cells" and "pus_cell" using random sampling method and rest of cat_cols using mode imputation

In [None]:

# These two columns are filled by random sampling from their observed values
for col in ('red_blood_cells', 'pus_cell'):
    random_value_imputation(col)

# Every categorical column is then passed through mode imputation
for col in cat_cols:
    impute_mode(col)

# Show the remaining missing-value count per categorical column
Data[cat_cols].isna().sum()

### Dropping all the rows with na values

In [126]:

# Keep only the rows without any missing value (row-wise drop is the default)
newdata = Data.dropna()

### Printing the shape of earlier data set and new data set

In [None]:

# Compare the (rows, columns) counts before and after dropping rows with missing values
print(Data.shape)
print(newdata.shape)

### Separating X and y variables

In [None]:

# Features: every column from 'age' through 'aanemia' (label slice, both ends included)
X = Data.loc[:, 'age':'aanemia']
print(X.shape)
# Target: select 'class' as a 1-D Series. Slicing with 'class': returns a one-column
# DataFrame, which scikit-learn classifiers flag with a DataConversionWarning at fit time.
y = Data['class']
print(y.shape)

### Splitting the data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state=123 fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Shapes of the training and test feature sets
print(X_train.shape)
print(X_test.shape)

## Pipeline for dummy variable creation


In [129]:

# Importing the necessary packages
from sklearn.pipeline import Pipeline

In [130]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Categorical variables: one-hot encode, with handle_unknown='ignore' so categories
# not seen during fit do not raise at transform time
catTransformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
# Numerical variables: scale with StandardScaler
numTransformer = Pipeline(steps=[('scaler', StandardScaler())])

### Printing dtypes for X

In [None]:

# Show the dtype of every feature column
X.dtypes

### Selecting numerical features

In [None]:

# Labels of the int64/float64 feature columns; these are routed to the scaler
numFeatures = X.select_dtypes(include=['int64', 'float64']).columns
numFeatures

### Selecting Categorical features

In [None]:

# Labels of the object-dtype feature columns; these are routed to the one-hot encoder
catFeatures = X.select_dtypes(include=['object']).columns
catFeatures

### Creating the preprocessing engine

In [134]:
from sklearn.compose import ColumnTransformer
# Apply the scaling pipeline to the numerical columns and the one-hot pipeline
# to the categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTransformer, numFeatures),
        ('categoric', catTransformer, catFeatures)])

### Importing necessary libraries

In [135]:

from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier

### Creating a pipeline with AdaBoostClassifier

In [136]:

# Chain preprocessing, PCA and an AdaBoost classifier into a single estimator
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA()),
        ('classifier', AdaBoostClassifier(random_state=123)),
    ]
)

### Defining the parameters as a dictionary

In [137]:

# Hyper-parameter grid; each key is '<pipeline step>__<parameter>'
param_grid = {
    'dimred__n_components': [4, 6, 10],
    "classifier__n_estimators": [50, 100, 200],
    "classifier__learning_rate": [0.7, 0.6, 1.0],
}


In [138]:
from sklearn.model_selection import GridSearchCV
# Set up the grid search over param_grid with cv=10;
# nothing is fitted until estimator.fit is called
estimator = GridSearchCV(pipe, cv=10, param_grid=param_grid)

### Fitting the estimator on the training set

In [None]:
# Run the grid search on the training split
estimator.fit(X_train,y_train)

### Printing the best score and best parameters

In [None]:

# Report best_score_ together with the parameter combination that produced it
print("Best: %f using %s" % (estimator.best_score_, estimator.best_params_))


### Predicting with the best estimator

In [86]:
# Predict the test-set labels with the fitted grid search
pred = estimator.predict(X_test)

### Printing the classification report

In [None]:
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); passing the predictions first
# swaps the roles of precision and recall in the printed table.
print(classification_report(y_test, pred))