# 🌲 Decision Tree

# 🚫 Remove Null Values

In [187]:
import pandas as pd

In [188]:
#Let's import data frame
df = pd.read_csv('credit_risk - credit_risk.csv')
df

Unnamed: 0,age,salary,home_ownership,employment_time,loan_purposes,credit_score,Credit_Amount,loan_rate,Credit_Status,loan_percentage,Payment_History,Credit_History_Length
0,23,70000,RENT,3.0,EDUCATION,A,12000,7.90,0,0.17,N,2
1,25,25000,RENT,0.0,MEDICAL,C,8500,12.53,1,0.34,Y,2
2,24,67000,MORTGAGE,2.0,HOMEIMPROVEMENT,B,12150,10.37,0,0.18,N,3
3,23,69000,RENT,7.0,MEDICAL,A,6000,7.88,0,0.09,N,2
4,30,33000,MORTGAGE,2.0,DEBTCONSOLIDATION,A,6000,8.49,0,0.18,N,8
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,32,58800,OWN,16.0,PERSONAL,C,7000,13.99,0,0.12,N,7
32577,33,30000,RENT,2.0,VENTURE,E,6000,19.42,1,0.20,N,10
32578,39,18300,RENT,0.0,DEBTCONSOLIDATION,D,1200,14.74,1,0.07,Y,11
32579,28,36000,MORTGAGE,,VENTURE,D,6000,15.21,0,0.17,N,7


In [189]:
#Let's examine null values
df.isnull().sum()

age                         0
salary                      0
home_ownership              0
employment_time           895
loan_purposes               0
credit_score                0
Credit_Amount               0
loan_rate                3116
Credit_Status               0
loan_percentage             0
Payment_History             0
Credit_History_Length       0
dtype: int64

In [190]:
#Let's examine data types of variables
df.dtypes

age                        int64
salary                     int64
home_ownership            object
employment_time          float64
loan_purposes             object
credit_score              object
Credit_Amount              int64
loan_rate                float64
Credit_Status              int64
loan_percentage          float64
Payment_History           object
Credit_History_Length      int64
dtype: object

In [191]:
#Let's look at shape of data frame
df.shape

(32581, 12)

In [192]:
#Let's use label encoder for encoding object variables
from sklearn.preprocessing import LabelEncoder

df_object = df.select_dtypes(include='object')
columns_to_encode = df_object.columns

# Apply label encoding to selected columns
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col] = label_encoder.fit_transform(df[col])

In [193]:
#Let's drop null values
df_n = df.dropna()

In [194]:
#Let's shuffly dataset for choose randomly
df_n.sample(frac = 1.0)

Unnamed: 0,age,salary,home_ownership,employment_time,loan_purposes,credit_score,Credit_Amount,loan_rate,Credit_Status,loan_percentage,Payment_History,Credit_History_Length
3008,34,36000,3,3.0,5,2,3600,13.49,0,0.10,1,7
28751,24,55000,0,3.0,4,0,6000,7.51,0,0.11,0,4
28776,23,115000,0,7.0,1,2,10000,14.27,0,0.09,0,2
31616,23,96000,0,7.0,0,0,5000,7.51,0,0.05,0,2
4601,23,140000,3,5.0,0,1,12000,11.12,0,0.09,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...
1992,23,67200,3,7.0,1,2,5650,13.47,0,0.08,0,2
14672,23,25000,3,3.0,5,1,10000,11.83,1,0.40,0,3
29812,24,72400,0,4.0,4,4,25000,17.99,0,0.35,1,2
5411,27,36000,3,0.0,0,2,8000,13.85,1,0.22,1,6


In [195]:
#Let's look at shape of new data frame
df_n.shape

(28638, 12)

In [196]:
#Let's divide dataset target and explanatory values
df_ny = df_n['Credit_Status']
del df_n['Credit_Status']
df_nx = df_n

In [197]:
#Let's divide data train and test
faiz = 0.8
size = len(df_nx) 
train_x = df_nx[1:int(size*faiz)]
train_y = df_ny[1:int(size*faiz)]
test_x = df_nx[int(size*faiz):]
test_y = df_ny[int(size*faiz):]

In [198]:
#Let's find shape of dataset
for dfsh in [train_x, train_y,test_x,test_y]:
    print(dfsh.shape)

(22909, 11)
(22909,)
(5728, 11)
(5728,)


In [199]:
#Let's apply decision tree to this data frame
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree = dtree.fit(train_x,train_y)

In [200]:
#Let's create predict values
predict = dtree.predict(test_x)
predict = pd.DataFrame(predict) 
predict.shape

(5728, 1)

In [201]:
test_y = pd.DataFrame(test_y) 
test_y.shape

(5728, 1)

In [202]:
#Let's concat two data frame , predict and test_y
df_score = pd.concat([predict.reset_index(drop=True), test_y.reset_index(drop=True)], axis=1)
df_score.columns = ['Predict','Real Output' ]
df_score

Unnamed: 0,Predict,Real Output
0,0,0
1,1,0
2,1,1
3,0,0
4,0,0
...,...,...
5723,0,0
5724,0,0
5725,1,1
5726,1,1


In [203]:
#Let's find accuracy score for test_y and predict 
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(test_y, predict)
print("Accuracy Score:", accuracy)

Accuracy Score: 0.8868715083798883


# 🔄 Null replace

In [204]:
#Let's import data frame again
df = pd.read_csv('credit_risk - credit_risk.csv')
df

Unnamed: 0,age,salary,home_ownership,employment_time,loan_purposes,credit_score,Credit_Amount,loan_rate,Credit_Status,loan_percentage,Payment_History,Credit_History_Length
0,23,70000,RENT,3.0,EDUCATION,A,12000,7.90,0,0.17,N,2
1,25,25000,RENT,0.0,MEDICAL,C,8500,12.53,1,0.34,Y,2
2,24,67000,MORTGAGE,2.0,HOMEIMPROVEMENT,B,12150,10.37,0,0.18,N,3
3,23,69000,RENT,7.0,MEDICAL,A,6000,7.88,0,0.09,N,2
4,30,33000,MORTGAGE,2.0,DEBTCONSOLIDATION,A,6000,8.49,0,0.18,N,8
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,32,58800,OWN,16.0,PERSONAL,C,7000,13.99,0,0.12,N,7
32577,33,30000,RENT,2.0,VENTURE,E,6000,19.42,1,0.20,N,10
32578,39,18300,RENT,0.0,DEBTCONSOLIDATION,D,1200,14.74,1,0.07,Y,11
32579,28,36000,MORTGAGE,,VENTURE,D,6000,15.21,0,0.17,N,7


In [205]:
#Let's examine null values
df.isnull().sum()

age                         0
salary                      0
home_ownership              0
employment_time           895
loan_purposes               0
credit_score                0
Credit_Amount               0
loan_rate                3116
Credit_Status               0
loan_percentage             0
Payment_History             0
Credit_History_Length       0
dtype: int64

In [206]:
#Let's examine mean of employment_time column
df['employment_time'].mean()

4.789686296787225

In [207]:
#Let's group the ages where employment_time is null and find the mean employment_time for them
age_d = {}
for age in df[df['employment_time'].isnull()]['age']:
    age_d[age] = df[df['age'] == age]['employment_time'].mean()
    
age_d

{25: 4.653794037940379,
 23: 3.934817170111288,
 22: 3.6538789428815006,
 26: 4.843388429752066,
 39: 6.054794520547945,
 37: 5.76545842217484,
 21: 3.299660441426146,
 24: 4.317800289435601,
 27: 4.894033412887828,
 28: 5.468144044321329,
 33: 5.686298076923077,
 32: 5.680084745762712,
 29: 5.361482381530984,
 35: 5.518151815181518,
 30: 5.606635071090047,
 38: 5.704419889502763,
 36: 5.9812734082397006,
 40: 5.934865900383142,
 47: 6.087912087912088,
 49: 6.5,
 42: 6.297297297297297,
 34: 5.865412445730825,
 31: 5.3495495495495495,
 43: 5.968152866242038,
 44: 6.402985074626866,
 41: 6.32051282051282,
 48: 6.378378378378378,
 56: 6.0,
 51: 4.526315789473684,
 50: 6.0,
 70: 0.8333333333333334,
 53: 6.0344827586206895}

In [208]:
#Let's fill the null values with the average employment_time of people of the same age
df['employment_time'] = df.groupby('age')['employment_time'].transform(lambda x: x.fillna(x.mean()))

In [209]:
#Let's check sum of null values
df.isnull().sum()

age                         0
salary                      0
home_ownership              0
employment_time             0
loan_purposes               0
credit_score                0
Credit_Amount               0
loan_rate                3116
Credit_Status               0
loan_percentage             0
Payment_History             0
Credit_History_Length       0
dtype: int64

In [210]:
#Let's examine correlations between loan_rate and another columns and find the best correlation score
df.corr()['loan_rate']

  df.corr()['loan_rate']


age                      0.012580
salary                   0.000792
employment_time         -0.054992
Credit_Amount            0.146813
loan_rate                1.000000
Credit_Status            0.335133
loan_percentage          0.120314
Credit_History_Length    0.016696
Name: loan_rate, dtype: float64

In [211]:
#Fill the null values with the average work_time of people of the same age
df['loan_rate'] = df.groupby('Credit_Status')['loan_rate'].transform(lambda x: x.fillna(x.mean()))

In [212]:
#Let's use label encoder for encoding object variables
from sklearn.preprocessing import LabelEncoder

df_object = df.select_dtypes(include='object')
columns_to_encode = df_object.columns

# Apply label encoding to selected columns
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col] = label_encoder.fit_transform(df[col])

print(df)

       age  salary  home_ownership  employment_time  loan_purposes  \
0       23   70000               3         3.000000              1   
1       25   25000               3         0.000000              3   
2       24   67000               0         2.000000              2   
3       23   69000               3         7.000000              3   
4       30   33000               0         2.000000              0   
...    ...     ...             ...              ...            ...   
32576   32   58800               2        16.000000              4   
32577   33   30000               3         2.000000              5   
32578   39   18300               3         0.000000              0   
32579   28   36000               0         5.468144              5   
32580   21   42000               0         6.000000              1   

       credit_score  Credit_Amount  loan_rate  Credit_Status  loan_percentage  \
0                 0          12000       7.90              0             0.17 

In [213]:
#Let's assign emplanotory and target values
y = df['Credit_Status']
del df['Credit_Status']
X = df

In [214]:
#Let's use train test split 
from sklearn.model_selection import train_test_split
train_x,test_x,train_y, test_y = train_test_split(X,y, test_size = 0.2 , random_state = 42)

In [215]:
train_y

32377    0
1338     0
7047     0
8225     0
7178     1
        ..
29802    1
5390     0
860      0
15795    0
23654    0
Name: Credit_Status, Length: 26064, dtype: int64

In [216]:
#Let's apply decision tree to this data frame
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree = dtree.fit(train_x,train_y)

In [217]:
#Let's predict test_x
y_pred_test = dtree.predict(test_x)

In [218]:
#Let's find accuracy score for test_y and y_pred_test
from sklearn.metrics import accuracy_score

print("Accuracy: " ,accuracy_score(test_y,y_pred_test))

Accuracy:  0.9002608562221881


In [219]:
#Let's add real y and predict y values to test_x data frame
test_x['Predict'] = y_pred_test
test_x['Real'] = test_y

In [220]:
#Let's add correct and 4 measurement
#TP - True Positive --> Not churn and correct predict
#TN - True Negative --> Churn and correct predict
#FP - False Positive --> Not churn and wrong predict
#FN - False Negative --> Churn and wrong predict

test_x['Correct'] = test_x['Predict'] == test_x['Real']
test_x['TP'] = (test_x['Predict'] == 1) & (test_x['Real'] == 1)
test_x['TN'] = (test_x['Predict'] == 0) & (test_x['Real'] == 0)
test_x['FP'] = (test_x['Predict'] == 1) & (test_x['Real'] == 0)
test_x['FN'] = (test_x['Predict'] == 0) & (test_x['Real'] == 1)

In [222]:
#Let's convert new measure columns to int values

for column in ['Correct','TP','TN','FP','FN']:
    test_x[column] = test_x[column].astype(int)

In [223]:
#Let's look at test set
test_x

Unnamed: 0,age,salary,home_ownership,employment_time,loan_purposes,credit_score,Credit_Amount,loan_rate,loan_percentage,Payment_History,Credit_History_Length,Predict,Real,Correct,TP,TN,FP,FN
14668,24,30000,3,0.0,1,0,1000,7.51,0.03,0,2,0,0,1,0,1,0,0
24614,22,60000,3,5.0,1,1,15000,10.59,0.25,0,3,0,0,1,0,1,0,0
11096,32,113000,3,5.0,2,3,12000,16.29,0.11,0,5,0,1,0,0,0,0,1
10424,31,52000,0,6.0,1,0,10000,8.49,0.19,0,5,0,0,1,0,1,0,0
26007,23,200000,0,0.0,1,0,5000,7.43,0.03,0,2,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31330,24,55000,0,8.0,1,1,6000,11.99,0.11,0,2,0,1,0,0,0,0,1
2862,36,50000,3,3.0,5,1,3000,11.26,0.06,0,11,0,0,1,0,1,0,0
14754,34,50000,3,0.0,2,2,15000,12.72,0.30,1,9,0,0,1,0,1,0,0
14170,31,47000,0,0.0,3,2,10000,11.03,0.21,1,7,0,0,1,0,1,0,0


In [154]:
#Let's find mean of measurements
#Correct values must be equal to accuracy score

for column in ['Correct','TP','TN','FP','FN']:
    print(f"{column} : {test_x[column].mean()}")

Correct : 0.8973454043271444
TP : 0.16970998925886144
TN : 0.727635415068283
FP : 0.05830903790087463
FN : 0.04434555777198097


In [159]:
#Let's find accuracy score from 4 measurements
# (TP+TN)/(TP+TN+FP+FN)
    
accuracy = (test_x['TP'].mean()+test_x['TN'].mean())/1
accuracy

0.8973454043271445

# ⚠️ Clean Outlier values

In [168]:
#Let's import data frame again
df = pd.read_csv('credit_risk - credit_risk.csv')
df

Unnamed: 0,age,salary,home_ownership,employment_time,loan_purposes,credit_score,Credit_Amount,loan_rate,Credit_Status,loan_percentage,Payment_History,Credit_History_Length
0,23,70000,RENT,3.0,EDUCATION,A,12000,7.90,0,0.17,N,2
1,25,25000,RENT,0.0,MEDICAL,C,8500,12.53,1,0.34,Y,2
2,24,67000,MORTGAGE,2.0,HOMEIMPROVEMENT,B,12150,10.37,0,0.18,N,3
3,23,69000,RENT,7.0,MEDICAL,A,6000,7.88,0,0.09,N,2
4,30,33000,MORTGAGE,2.0,DEBTCONSOLIDATION,A,6000,8.49,0,0.18,N,8
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,32,58800,OWN,16.0,PERSONAL,C,7000,13.99,0,0.12,N,7
32577,33,30000,RENT,2.0,VENTURE,E,6000,19.42,1,0.20,N,10
32578,39,18300,RENT,0.0,DEBTCONSOLIDATION,D,1200,14.74,1,0.07,Y,11
32579,28,36000,MORTGAGE,,VENTURE,D,6000,15.21,0,0.17,N,7


In [161]:
df.corr()

  df.corr()


Unnamed: 0,age,salary,employment_time,Credit_Amount,loan_rate,Credit_Status,loan_percentage,Credit_History_Length
age,1.0,0.173202,0.163106,0.050787,0.01258,-0.021629,-0.042411,0.859133
salary,0.173202,1.0,0.134268,0.26682,0.000792,-0.144449,-0.254471,0.117987
employment_time,0.163106,0.134268,1.0,0.113082,-0.056405,-0.082489,-0.054111,0.144699
Credit_Amount,0.050787,0.26682,0.113082,1.0,0.146813,0.105376,0.572612,0.041967
loan_rate,0.01258,0.000792,-0.056405,0.146813,1.0,0.335133,0.120314,0.016696
Credit_Status,-0.021629,-0.144449,-0.082489,0.105376,0.335133,1.0,0.379366,-0.015529
loan_percentage,-0.042411,-0.254471,-0.054111,0.572612,0.120314,0.379366,1.0,-0.03169
Credit_History_Length,0.859133,0.117987,0.144699,0.041967,0.016696,-0.015529,-0.03169,1.0


In [172]:
for i in ['salary','employment_time','Credit_Amount','loan_rate']:
    below_ca = df[i].mean() - 3*df[i].std() 
    above_ca = df[i].mean() + 3*df[i].std()
    df = df[(df[i] <= above_ca) | (df[i] >= below_ca)]

In [175]:
df.shape

(28638, 12)

In [177]:
df['employment_time'] = df.groupby('age')['employment_time'].transform(lambda x: x.fillna(x.mean()))
df['loan_rate'] = df.groupby('Credit_Status')['loan_rate'].transform(lambda x: x.fillna(x.mean()))

In [179]:
#Let's use label encoder for encoding object variables
from sklearn.preprocessing import LabelEncoder

df_object = df.select_dtypes(include='object')
columns_to_encode = df_object.columns

# Apply label encoding to selected columns
label_encoder = LabelEncoder()
for col in columns_to_encode:
    df[col] = label_encoder.fit_transform(df[col])

In [180]:
#Let's assign emplanotory and target values
y = df['Credit_Status']
del df['Credit_Status']
X = df

In [181]:
#Let's use train test split 
from sklearn.model_selection import train_test_split
train_x,test_x,train_y, test_y = train_test_split(X,y, test_size = 0.2 , random_state = 42)

In [182]:
#Let's apply decision tree to this data frame
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree = dtree.fit(train_x,train_y)

In [183]:
#Let's predict test_x
y_pred_test = dtree.predict(test_x)

In [184]:
#Let's find accuracy score for test_y and y_pred_test
from sklearn.metrics import accuracy_score

print("Accuracy: " ,accuracy_score(test_y,y_pred_test))

Accuracy:  0.8833798882681564


## 🔖 The best result: Null replace

# 🚧 The End