## Import required libraries and dependancies

In [2]:
## For some arithmetic and Matrix Operations
import numpy as np

## Dataframe Manipulation
import pandas as pd

## For Visualization
import matplotlib.pyplot as plt

## For Visualization too
import seaborn as sns

## Creating Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


## Creating a function transformer
from sklearn.preprocessing import FunctionTransformer

## For Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector

## For preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder

## For missing values
from sklearn.impute import SimpleImputer

## Getting the recall score on our train set
from sklearn.metrics import recall_score

## Getting the accuracy score on train set
from sklearn.metrics import accuracy_score

## Getting the classification report from our train set
from sklearn.metrics import classification_report

## Cross validation
from sklearn.model_selection import cross_val_score
## Gridsearch CV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

## Imbalanced pipeline and SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

## Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# from xgboost import XGBClassifier
from IPython.display import HTML,display

import warnings
warnings.filterwarnings("ignore")

## Some important functions

In [3]:
## printing the shape and head
def head(df,shape_only=False):
    print(df.shape)

    if shape_only:
        return
    else:
        return df.head()

## for EDA of categorical values
def eda_bivariate_categorical(df,column,target):

    fig,ax = plt.subplots(figsize = (9,8))

    color = 'Set2'

    palette_color = sns.color_palette(color)

    ax = sns.countplot(x = column, data=df, hue=target,palette=color,order = df[column].value_counts().index)
    ax.set_ylabel('Count')

    offset = df[column].value_counts().max() * 0.005

    list_bars = df.groupby([column,target])[column].agg(['count']).unstack().fillna(0).values

    patches = ax.patches
    bars_pos = 0

    for i in range(df[target].nunique()):
        for j in range(df[column].nunique()):
            list_bars_col = list_bars[j] 
            total_sum = list_bars_col.sum()
            value = list_bars_col[i]

            percentage = value / total_sum

            if percentage == 0:
                bars_pos += 1
                continue
            else:
                x = patches[bars_pos].get_x() + patches[j].get_width()/2
                y = patches[bars_pos].get_height() + offset
                ax.annotate('{:.1f}%'.format(percentage*100), (x, y), ha='center')
                bars_pos += 1
    plt.show()

## Function that plots numerical variables into histogram and violin plot
def eda_bivariate_numerical(data,column,target,color,
                    figsize=(12,6),
                    # save=True,
                    val=0,
                    target_type = 'Numerical'):

    fig, axes = plt.subplots(1, 2, figsize=figsize)
    cmap = sns.color_palette(color)
    val = val

    for i in range(1):
        for j in range(2):
            if j==0:
                    sns.histplot(data = data,x=data[column],hue=target,
                                bins=50,kde=True,palette=color,ax=axes[j])
                    axes[j].set(xlabel=None)
                    axes[j].grid(False)
            elif j==1:
                sns.boxplot(data = data,x=data[column],y = target, ax=axes[j], palette=color,orient='h',
                )
                axes[j].set(xlabel=None)
                axes[j].grid(False)
                val += 1
                plt.tight_layout()
            if target_type == 'Numerical':
                plt.suptitle(column)
            else:
                plt.suptitle(f'{column} vs. {target}')
    plt.show()
    
    # path = 'Figures\\Numerical\\'
    # if save:
    #     plt.savefig(f"{path}{column}.pdf",dpi=1000)


## print text to see the font
def print_text(text):
    fig, ax = plt.subplots(figsize=(6, 1), facecolor="#eefade")
    ax.text(0.5, 0.5, text, ha='center', va='center', size=40)
    ax.axis("off")
    plt.show()

## Import data

In [4]:
path='C:\\Users\\91772\\DemoProjects\\CardioVascular\\'
data=pd.read_csv(path+'CVD_cleaned.csv')
head(data)

(308854, 19)


Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,150.0,32.66,14.54,Yes,0.0,30.0,16.0,12.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,165.0,77.11,28.29,No,0.0,30.0,0.0,4.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,163.0,88.45,33.47,No,4.0,12.0,3.0,16.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,180.0,93.44,28.73,No,0.0,30.0,30.0,8.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,191.0,88.45,24.37,Yes,0.0,8.0,4.0,0.0


## EDA

### Univariate 

In [5]:
import pandas_profiling as pp
profile = pp.ProfileReport(data)
profile.to_file("output.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Bivariate

In [6]:
import sweetviz
report=sweetviz.analyze([data,'Train'],target_feat='Heart_Disease')
report.show_html('Report.html')

                                             |                                             | [  0%]   00:00 ->…

Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


### categorical and numerical variables

In [7]:
Target=pd.DataFrame()
Target=data['Heart_Disease'] #Target_variable
X=data['Heart_Disease']   
data.drop(columns={'Heart_Disease'},axis=1,inplace=True)
cat_var=data.select_dtypes(include='object')
num_var=data.select_dtypes(exclude='object')
# num_var1=num_var.columns.to_list()
# cat_var1=cat_var.columns.to_list()
# cols=data1.columns.to_list()

### Muticollinearity-- VIF

In [8]:
columns=['Weight_(kg)','Height_(cm)']
data.drop(columns=columns,axis=1,inplace=True)
num_var.drop(columns=columns,axis=1,inplace=True)

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data["feature"] = num_var.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(num_var.values, i)
                          for i in range(len(num_var.columns))]
vif_data

Unnamed: 0,feature,VIF
0,BMI,3.237544
1,Alcohol_Consumption,1.345132
2,Fruit_Consumption,2.473787
3,Green_Vegetables_Consumption,2.133052
4,FriedPotato_Consumption,1.528777


## Skewness

In [10]:
num_var1=num_var.columns.to_list()
def skew_def(x):
    if x> -0.5 and x< 0.5:
        return 'Normally distributed'
    if x> -0.1 and x<-0.5:
        return 'slightly negative skewed'
    if x< -0.1:
        return 'extremely negative skewed'
    if x > 0.5 and x< 1:
        return 'slightly positive skewed'
    else:
        return 'extremely positive skewed'
report=pd.DataFrame()
report['skew']=data[num_var1].skew()
report['skew_flag']=report['skew'].apply(lambda x: skew_def(x))
report

Unnamed: 0,skew,skew_flag
BMI,1.376619,extremely positive skewed
Alcohol_Consumption,1.885622,extremely positive skewed
Fruit_Consumption,1.248428,extremely positive skewed
Green_Vegetables_Consumption,2.415608,extremely positive skewed
FriedPotato_Consumption,4.91235,extremely positive skewed


In [14]:
import seaborn as sns
for i in num_var1:
    # Plotting a histogram with KDE
    sns.histplot(data[i], kde=True)
    plt.xlabel(i)
    plt.ylabel('Frequency')
    plt.title('Distribution visuals')
    plt.show()
    plt.boxplot(data[i],vert=0) # show plot
    print('Outliers visualization for',i)
    plt.show()
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

Outliers visualization for BMI
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Outliers visualization for Alcohol_Consumption
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Outliers visualization for Fruit_Consumption
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Outliers visualization for Green_Vegetables_Consumption
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Outliers visualization for FriedPotato_Consumption
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@


In [13]:
data_copy=data.copy()

## Preprocessing

In [15]:
X=X.map({'No':0,'Yes':1})
X.value_counts(normalize=True)       #imbalance dataset

0    0.91915
1    0.08085
Name: Heart_Disease, dtype: float64

In [16]:
data1=pd.concat([data,Target],axis=1)
data1['Heart_Disease']=data1['Heart_Disease'].map({'No':0,'Yes':1})

In [17]:
data1.head()

Unnamed: 0,General_Health,Checkup,Exercise,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption,Heart_Disease
0,Poor,Within the past 2 years,No,No,No,No,No,Yes,Female,70-74,14.54,Yes,0.0,30.0,16.0,12.0,0
1,Very Good,Within the past year,No,No,No,No,Yes,No,Female,70-74,28.29,No,0.0,30.0,0.0,4.0,1
2,Very Good,Within the past year,Yes,No,No,No,Yes,No,Female,60-64,33.47,No,4.0,12.0,3.0,16.0,0
3,Poor,Within the past year,Yes,No,No,No,Yes,No,Male,75-79,28.73,No,0.0,30.0,30.0,8.0,1
4,Good,Within the past year,No,No,No,No,No,No,Male,80+,24.37,Yes,0.0,8.0,4.0,0.0,0


In [18]:
num_var1=num_var.columns.to_list()
cat_var1=cat_var.columns.to_list()
cols=data1.columns.to_list()

## Feature selection using WoE and IV

In [19]:
import pandas as pd
import numpy as np

def data_vars(df, target):
    lst = []
    cols = df.columns

    for var in cols:
        if var != target:
            data = data1[[General_Health, Heart_Disease]].copy()
            data['Variable'] = var
            data = data.groupby(['Variable', var, target]).size().unstack().fillna(0)
            data['% of Total'] = data.sum(axis=1) / data.sum().sum()
            data['% of Good'] = data[0] / data[0].sum()
            data['% of Bad'] = data[1] / data[1].sum()
            data['WoE'] = np.log(data['% of Good'] / data['% of Bad'])
            data['IV'] = (data['% of Good'] - data['% of Bad']) * data['WoE']
            data = data.replace([np.inf, -np.inf], 0)
            data['IV'] = data['IV'].sum()
            lst.append(data['IV'].values[0])

    iv_df = pd.DataFrame({'Variable': cols.drop(target), 'IV': lst})
    iv_df = iv_df.sort_values('IV', ascending=False).reset_index(drop=True)
    return iv_df

In [20]:
def iv_woe(data, target, bins=10, show_woe=False):
    
    #Empty Dataframe
    newDF,woeDF = pd.DataFrame(), pd.DataFrame()
    
    #Extract Column Names
    cols = data.columns
    
    #Run WOE and IV on all the independent variables
    for ivars in cols[~cols.isin([target])]:
        if (data[ivars].dtype.kind in 'bifc') and (len(np.unique(data[ivars]))>10):
            binned_x = pd.qcut(data[ivars], bins,  duplicates='drop')
            d0 = pd.DataFrame({'x': binned_x, 'y': data[target]})
        else:
            d0 = pd.DataFrame({'x': data[ivars], 'y': data[target]})

        
        # Calculate the number of events in each group (bin)
        d = d0.groupby("x", as_index=False).agg({"y": ["count", "sum"]})
        d.columns = ['Cutoff', 'N', 'Events']
        
        # Calculate % of events in each group.
        d['% of Events'] = np.maximum(d['Events'], 0.5) / d['Events'].sum()

        # Calculate the non events in each group.
        d['Non-Events'] = d['N'] - d['Events']
        # Calculate % of non events in each group.
        d['% of Non-Events'] = np.maximum(d['Non-Events'], 0.5) / d['Non-Events'].sum()

        # Calculate WOE by taking natural log of division of % of non-events and % of events
        d['WoE'] = np.log(d['% of Events']/d['% of Non-Events'])
        d['IV'] = d['WoE'] * (d['% of Events'] - d['% of Non-Events'])
        d.insert(loc=0, column='Variable', value=ivars)
#         print("Information value of " + ivars + " is " + str(round(d['IV'].sum(),6)))
        temp =pd.DataFrame({"Variable" : [ivars], "IV" : [d['IV'].sum()]}, columns = ["Variable", "IV"])
        newDF=pd.concat([newDF,temp], axis=0)
        woeDF=pd.concat([woeDF,d], axis=0)
        newDF['Predictive_power']=np.where(newDF['IV']<0.02,'Useless',
                                           np.where(newDF['IV']<0.1,'Weak',
                                                    np.where(newDF['IV']<0.3,'Medium',
                                                             np.where(newDF['IV']<0.5,'Strong','Too good to be true'
                                          ))))


        #Show WOE Table
        if show_woe == True:
            display(HTML(d.to_html()))
            print('information value of'+ivars+'is='+str(round(d['IV'].sum(),6)))
    display(HTML(newDF.to_html()))
    return newDF, woeDF

In [21]:
cols.remove('Heart_Disease')
iv,woe=iv_woe(data=data1[cols+['Heart_Disease']],target='Heart_Disease',bins=10,show_woe=False)

Unnamed: 0,Variable,IV,Predictive_power
0,General_Health,0.70084,Too good to be true
0,Checkup,0.164133,Medium
0,Exercise,0.108323,Medium
0,Skin_Cancer,0.084866,Weak
0,Other_Cancer,0.087424,Weak
0,Depression,0.013317,Useless
0,Diabetes,0.324345,Strong
0,Arthritis,0.292621,Medium
0,Sex,0.071645,Weak
0,Age_Category,0.997017,Too good to be true
