In [124]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

#importo libreria panda per leggere ed elaborare csv
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)




In [125]:
data=pd.read_csv('/Users/ragno/Documents/Progetti/Mldm/ProgettoMLDM/Dataset/Loan_Default.csv')


In [126]:
#elimino dati poco significanti

cols=['ID','year','Interest_rate_spread','rate_of_interest','Upfront_charges','Gender','dtir1']
data=data.drop(cols,axis=1)




In [127]:
data['Status'].value_counts()

0    112031
1     36639
Name: Status, dtype: int64

In [128]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer


#converto in valori numerici
colonne=['loan_limit','loan_type','approv_in_adv','loan_purpose','Credit_Worthiness','business_or_commercial','occupancy_type','credit_type','co-applicant_credit_type','submission_of_application','Region','Neg_ammortization','interest_only','lump_sum_payment','age','open_credit','construction_type','Secured_by','total_units','Security_Type']
imputer = SimpleImputer(strategy='most_frequent') 

data[colonne]=imputer.fit_transform(data[colonne])

numerical=['term','income','submission_of_application','property_value','construction_type','Secured_by','total_units','Security_Type','LTV']

imputer = SimpleImputer(strategy='most_frequent') 
data[numerical]=imputer.fit_transform(data[numerical])

encoder = OrdinalEncoder(dtype=np.int16)
data[colonne] = encoder.fit_transform(data[colonne])





In [129]:
pd.set_option('display.max_columns', None)
data.isna().sum()

loan_limit                   0
approv_in_adv                0
loan_type                    0
loan_purpose                 0
Credit_Worthiness            0
open_credit                  0
business_or_commercial       0
loan_amount                  0
term                         0
Neg_ammortization            0
interest_only                0
lump_sum_payment             0
property_value               0
construction_type            0
occupancy_type               0
Secured_by                   0
total_units                  0
income                       0
credit_type                  0
Credit_Score                 0
co-applicant_credit_type     0
age                          0
submission_of_application    0
LTV                          0
Region                       0
Security_Type                0
Status                       0
dtype: int64

In [130]:
data['Status'].value_counts()

0    112031
1     36639
Name: Status, dtype: int64

In [131]:
from sklearn.model_selection import train_test_split


In [132]:
from sklearn.tree import DecisionTreeClassifier

X=data.drop('Status',axis=1)
y=data['Status']

x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8)



tree_clf = DecisionTreeClassifier(max_depth=6, random_state=42)
tree_clf.fit(x_train, y_train)

tree_clf.score(x_test, y_test)

0.8704513351718571

In [133]:
x_train.count()

loan_limit                   118936
approv_in_adv                118936
loan_type                    118936
loan_purpose                 118936
Credit_Worthiness            118936
open_credit                  118936
business_or_commercial       118936
loan_amount                  118936
term                         118936
Neg_ammortization            118936
interest_only                118936
lump_sum_payment             118936
property_value               118936
construction_type            118936
occupancy_type               118936
Secured_by                   118936
total_units                  118936
income                       118936
credit_type                  118936
Credit_Score                 118936
co-applicant_credit_type     118936
age                          118936
submission_of_application    118936
LTV                          118936
Region                       118936
Security_Type                118936
dtype: int64

In [134]:
x_test.count()

loan_limit                   29734
approv_in_adv                29734
loan_type                    29734
loan_purpose                 29734
Credit_Worthiness            29734
open_credit                  29734
business_or_commercial       29734
loan_amount                  29734
term                         29734
Neg_ammortization            29734
interest_only                29734
lump_sum_payment             29734
property_value               29734
construction_type            29734
occupancy_type               29734
Secured_by                   29734
total_units                  29734
income                       29734
credit_type                  29734
Credit_Score                 29734
co-applicant_credit_type     29734
age                          29734
submission_of_application    29734
LTV                          29734
Region                       29734
Security_Type                29734
dtype: int64

In [135]:
y_test.describe()

count    29734.000000
mean         0.243492
std          0.429197
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Status, dtype: float64

In [136]:
y_train

141245    0
3507      0
53688     0
46491     1
54671     0
         ..
119879    1
103694    0
131932    0
146867    0
121958    1
Name: Status, Length: 118936, dtype: int64