#### Import the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

#### Read the dataset

In [2]:
# Load the heart-disease dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the recorded run
# of this cell ended in FileNotFoundError, so confirm the location first.
csv_path = 'E:/Imarticus_CF/Datasets/HeartDisease.csv'
df = pd.read_csv(csv_path)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'E:/Imarticus_CF/Datasets/HeartDisease.csv'

In [None]:
# Lift pandas' display limits so that all columns and rows are shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [None]:
# Show the (rows, columns) shape of the loaded frame.
df.shape
# Author's recorded result: rows = 303, cols = 14

### Data Preprocessing

#### 1) Handling Null values

In [None]:
# Count the missing values in each column.
df.isnull().sum()

#### 2) Handling Duplicates

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Drop duplicate rows in place, then re-count to confirm none remain.
df.drop_duplicates(inplace=True)
df.duplicated().sum()

#### 3) Check data types

In [None]:
# Show the data type of every column.
df.dtypes

#### 4) Target Variable

In [None]:
# Frequency of each value in the target column (class balance check).
df['target'].value_counts()

### EDA

#### Countplot for Target

In [None]:
# Bar chart of how many rows fall into each target value.
target_ax = sns.countplot(x=df['target'])
target_ax.set_title('Countplot for Target')
plt.show()

In [None]:
# List the column names.
df.columns

In [None]:
# Frequency of each value in the gender column.
df['gender'].value_counts()

In [None]:
# Print each column name followed by its number of distinct values; a low
# count suggests the column is categorical.
for column in df.columns:
    distinct_values = df[column].nunique()
    print(f'Feature {column}')
    print(distinct_values)

In [None]:
# List the column names again before splitting them into feature groups.
df.columns

In [None]:
# Columns treated as continuous; every other column is handled as categorical.
cont_feat = [
    'age',
    'rest_bps',
    'cholestrol',
    'thalach',
    'old_peak',
]
# cat_feat = ['gender','chest_pain','fasting_blood_sugar','rest_ecg','exer_angina',
#            'slope', 'ca', 'thalassemia']

In [None]:
# Collect every column that is not listed in cont_feat, keeping the frame's
# column order, then show the result.
cat_feat = []
for column in df.columns:
    if column in cont_feat:
        continue
    cat_feat.append(column)
print(cat_feat)

In [None]:
# Plot the target distribution split by each categorical feature.
# cat_feat is built from every non-continuous column, so it also contains
# 'target' itself; that entry is skipped because a target-vs-target plot
# carries no information.
for feature in cat_feat:
    if feature == 'target':
        continue
    sns.countplot(x=df['target'],hue=df[feature])
    plt.title(f'Count of target wrt {feature}')
    plt.show()

#### Inference
1) Female patients are more affected by heart disease.<br>
2) Patients with chest pain type 2 are more affected by heart disease.<br>
3) The remaining plots can be read in the same way.

### Correlation

In [None]:
# Pairwise correlation matrix of the frame's columns.
# NOTE(review): pandas >= 2.0 no longer silently drops non-numeric columns
# here - confirm that every column is numeric.
corr = df.corr()

In [None]:
# Heatmap of only the strong correlations: cells whose absolute value is
# 0.7 or below become NaN and are left blank.
strong_corr = corr[abs(corr) > 0.7]
plt.figure(figsize=(12, 12))
sns.heatmap(strong_corr, annot=True, cmap='RdBu')
plt.show()

#### Inference
1) Features are not highly correlated.

#### Outlier Treatment

In [None]:
# Recall which columns are treated as continuous before checking for outliers.
print(cont_feat)

In [None]:
# One boxplot per continuous feature to spot outliers visually.
for feature in cont_feat:
    box_ax = sns.boxplot(x=df[feature])
    box_ax.set_title(f'Boxplot for {feature}')
    plt.show()

In [None]:
# Summary statistics with extra upper-tail percentiles, one row per feature.
upper_percentiles = [0.95, 0.97, 0.98, 0.99]
df[cont_feat].describe(percentiles=upper_percentiles).T

In [None]:
# Shape of the subset of rows lying above each chosen cap (how many rows
# the capping step would affect).
outlier_caps = (('old_peak', 4.2), ('rest_bps', 170.00), ('cholestrol', 353.98))
for column, cap in outlier_caps:
    print(df[df[column] > cap].shape)

In [None]:
# def outlier_treatment(x):
#     x = x.clip(upper=x.quantile(0.99))
#     return x

In [None]:
# x = int(input())
# res = np.where(x%2==0,'Even','Odd')
# print(res)

In [None]:
# Cap each listed column at its upper threshold: values above the cap are
# replaced by the cap, everything else is kept as is.
for column, cap in (('old_peak', 4.2), ('rest_bps', 170.00), ('cholestrol', 353.98)):
    df[column] = np.where(df[column] > cap, cap, df[column])

In [None]:
# Re-draw the boxplots of the capped columns to check the effect of capping.
for column in ('old_peak', 'rest_bps', 'cholestrol'):
    box_ax = sns.boxplot(x=df[column])
    box_ax.set_title(f'{column}')
    plt.show()

#### Select x and y

In [None]:
# Features: every column except the label. Label: the 'target' column.
y = df['target']
x = df.drop(columns='target')
print(type(x), type(y))
print(x.shape, y.shape)

#### Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=42
)
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

#### Creating Functions to Evaluate Model Performance

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [None]:
# def eval_model(ytest,ypred):
#     cm = confusion_matrix(ytest,ypred)
#     print(cm)
#     print('Acc Score',accuracy_score(ytest,ypred))
#     print(classification_report(ytest,ypred))
    
def mscore(model):
    """Print ``model.score`` on the module-level train and test splits.

    The model must already be fitted; the data comes from the global
    ``x_train``/``y_train`` and ``x_test``/``y_test`` variables.
    """
    train_score = model.score(x_train, y_train)
    print('Train Score', train_score)
    test_score = model.score(x_test, y_test)
    print('Test Score', test_score)

In [None]:
def eval_model(model,x_train,x_test,y_train,y_test,model_name):
    """Fit ``model``, print its test-set metrics and return its scores.

    The model is fitted on the training split. The confusion matrix,
    accuracy score and classification report for the test split are printed.

    Returns a one-row DataFrame indexed by ``model_name`` with the columns
    ``Train_Score`` and ``Test_Score`` (the values of ``model.score`` on
    each split).
    """
    model.fit(x_train, y_train)
    scores = {
        'Train_Score': model.score(x_train, y_train),
        'Test_Score': model.score(x_test, y_test),
    }
    predictions = model.predict(x_test)
    print(confusion_matrix(y_test, predictions))
    print('Acc Score', accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    return pd.DataFrame(scores, index=[model_name])

#### Model Building

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree,export_text

In [None]:
# Baseline tree: Gini criterion, no depth or split limits set here.
dt1 = DecisionTreeClassifier(criterion='gini')
# Fit, print the test metrics and keep the one-row score table.
dt1_res = eval_model(dt1,x_train,x_test,y_train,y_test,'DT1(gini)')
dt1_res

In [None]:
# Show the train/test scores of the baseline tree.
dt1_res

In [None]:
# Constrained tree: Gini criterion with max_depth=7 and min_samples_split=15.
dt2 = DecisionTreeClassifier(criterion='gini',max_depth=7,min_samples_split=15)
# Fit, print the test metrics and keep the one-row score table.
dt2_res = eval_model(dt2,x_train,x_test,y_train,y_test,'DT2(gini,md,mss)')

In [None]:
# Show the train/test scores of the constrained tree.
dt2_res

#### Cross Validation for Max_Depth

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, KFold

In [None]:
# 5-fold cross-validated accuracy for each candidate max_depth.
# The tree is built inside the loop so that max_depth=i is actually applied
# to the model being scored. The fixed random_state gives every candidate
# the same shuffled folds, which keeps the scores comparable and repeatable.
kf_cv = KFold(n_splits=5,shuffle=True,random_state=42)
max_depth_range = list(range(7,13))  # 7,8,9,10,11,12
for i in max_depth_range:
    dt = DecisionTreeClassifier(criterion='gini',max_depth=i)
    score = cross_val_score(dt, x,y,cv=kf_cv,scoring='accuracy')
    print(f'Score for max_depth {i} is {score}')
    print(f'Mean Score max_depth {i} is {score.mean()}')

#### Cross Validation for Min_Samples_Split

In [None]:
# 5-fold cross-validated accuracy for each candidate min_samples_split.
# The tree is built inside the loop so that min_samples_split=i is actually
# applied to the model being scored. The fixed random_state gives every
# candidate the same shuffled folds, which keeps the scores comparable and
# repeatable.
kf_cv = KFold(n_splits=5,shuffle=True,random_state=42)
mss_range = list(range(8,20,2))  # 8,10,12,....,18
for i in mss_range:
    dt = DecisionTreeClassifier(criterion='gini',min_samples_split=i)
    score = cross_val_score(dt, x,y,cv=kf_cv,scoring='accuracy')
    print(f'Score for min_samples_split {i} is {score}')
    print(f'Mean Score min_samples_split {i} is {score.mean()}')

### Hyperparameter Tuning Techniques

#### GridSearchCV
1) Evaluates every possible combination of the supplied hyperparameter values and returns the best-performing combination.<br>
2) High time complexity<br>

#### RandomizedSearchCV
1) Evaluates a random sample of hyperparameter combinations and returns the best combination among those sampled.<br>
2) Low time complexity<br>


In [None]:
# Search space for the grid search: 2 criteria x 6 depths x 7 split sizes.
hparams = dict(
    criterion=['gini', 'entropy'],
    max_depth=[8, 9, 10, 11, 12, 14],
    min_samples_split=[8, 10, 12, 14, 15, 16, 20],
)

In [None]:
# Grid search over every combination in hparams, scored by accuracy and
# fitted on the training split only.
dt_base = DecisionTreeClassifier()
gscv = GridSearchCV(dt_base,param_grid=hparams,scoring='accuracy')  # cv is left at its default (author's note: cv=5)
gscv.fit(x_train,y_train)

In [None]:
# Hyperparameter combination that the grid search selected as best.
print(gscv.best_params_)

In [None]:
# Load the grid search's cv_results_ into a DataFrame and preview it.
gscv1_res = pd.DataFrame(gscv.cv_results_)
gscv1_res.head()

In [None]:
# Shape of the grid-search results table.
gscv1_res.shape

In [None]:
# Columns available in the grid-search results table.
gscv1_res.columns

In [None]:
# Rank the grid-search candidates from best (rank 1) to worst.
# sort_values returns a new frame here; sorting a column subset in place
# can raise SettingWithCopyWarning on pandas versions without copy-on-write.
comp1 = gscv1_res[['params', 'mean_test_score','rank_test_score']].sort_values(
    'rank_test_score',ascending=True)
comp1

In [None]:
# Print the grid search's best parameters, refitted estimator, index of the
# best candidate and its score, in that order.
for attribute in ('best_params_', 'best_estimator_', 'best_index_', 'best_score_'):
    print(getattr(gscv, attribute))

In [None]:
# Score of the fitted grid search on the training split, then on the test split.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(gscv.score(features, labels))

#### args and kwargs

In [None]:
# args - *, used with list or tuple
# kwargs - **, used with dict

In [None]:
def sum_n(*x):
    """Print the sum of any number of positional arguments (0 when none are given)."""
    running_total = 0
    for value in x:   # x is the tuple holding every positional argument
        running_total += value
    print(running_total)


sum_n(2,3)
sum_n()
sum_n(2,3,5,7,8,9,12,14,13)
sum_n(2,3,5,7)

In [None]:
# * inside a list literal: w2 keeps w1 as one nested element, whereas w3
# splices the items of w1 in at that position.
w1 = [5, 6, 7, 8]
w2 = [10, 20, 40, w1, 50]     # nested list
w3 = [10, 20, 40, *w1, 50]    # unpacked items
for demo_list in (w2, w3):
    print(demo_list)

In [None]:
def f1(**n):
    """Print the keyword arguments, which arrive packed into the dict ``n``."""
    received = n
    print(received)


f1(name='Ankit',age=22)
f1(age=22,city='Delhi',name='Ankit')

In [None]:
def prod(a,b):
    """Print the product of ``a`` and ``b``."""
    result = a * b
    print(result)


prod(5,7)

In [None]:
# ** inside a dict literal: w2 stores w1 as a nested value under 'k3',
# whereas w3 merges the key/value pairs of w1 into the new dict.
w1 = {'a1': 10, 'a2': 20}
w2 = {'k1': 5, 'k2': 7, 'k3': w1}    # nested dict
w3 = {'k1': 5, 'k2': 7, **w1}        # merged keys
for demo_dict in (w2, w3):
    print(demo_dict)

In [None]:
#### ** kwargs - keyword args

In [None]:
# Build a tree from the best hyperparameters found by the grid search
# (** unpacks the best_params_ dict into keyword arguments) and evaluate
# that tree, so the 'GS_Best_params' row reports the tuned model.
final_dt = DecisionTreeClassifier(**gscv.best_params_)
final_res = eval_model(final_dt,x_train,x_test,y_train,y_test,'GS_Best_params')
final_res

#### Randomized Search CV

In [None]:
# Search space for the randomized search (same values as the grid search).
hparams = dict(
    criterion=['gini', 'entropy'],
    max_depth=[8, 9, 10, 11, 12, 14],
    min_samples_split=[8, 10, 12, 14, 15, 16, 20],
)

In [None]:
# Randomized search: n_iter=20 combinations are drawn from hparams, scored
# by accuracy and fitted on the training split only.
dt_base1 = DecisionTreeClassifier()
rscv = RandomizedSearchCV(dt_base1,param_distributions=hparams,scoring='accuracy',n_iter=20)  # cv is left at its default (author's note: cv=5)
rscv.fit(x_train,y_train)

In [None]:
# Load the randomized search's cv_results_ into a DataFrame and show its shape.
rscv_res = pd.DataFrame(rscv.cv_results_)
rscv_res.shape

In [None]:
# Parameters, mean test score and rank for each sampled candidate.
rscv_res[['params','mean_test_score','rank_test_score']]

In [None]:
# Print the randomized search's best parameters, refitted estimator, index
# of the best candidate and its score, in that order.
for attribute in ('best_params_', 'best_estimator_', 'best_index_', 'best_score_'):
    print(getattr(rscv, attribute))

In [None]:
# Stack the one-row score tables of the three evaluated trees into a single
# comparison table.
dt_res = pd.concat([dt1_res,dt2_res,final_res])
dt_res