# ML Models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis



import warnings
warnings.filterwarnings("ignore")

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import os
os.chdir('/content/drive/MyDrive/likelion/credit-now') # 작업 폴더 있는 곳까지 이동 
os.getcwd()

import credit_data

In [4]:
train_data, test_data, train_label, test_label = credit_data.load_data()
print(train_data.shape, test_data.shape, train_label.shape, test_label.shape)

(15906, 67) (6818, 67) (15906, 1) (6818, 1)


<br>
<br>

# PyCaret을 활용한 low-code machine learning

<br>
<br>

#### 필수 라이브러리 설치

In [5]:
!pip install pycaret==2.3.10

Collecting jinja2>=2.11.1
  Using cached Jinja2-2.11.3-py2.py3-none-any.whl (125 kB)
Installing collected packages: jinja2
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 3.1.2
    Uninstalling Jinja2-3.1.2:
      Successfully uninstalled Jinja2-3.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.27.1 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed jinja2-2.11.3


In [6]:
pip install jinja2==3.1.2

Collecting jinja2==3.1.2
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Installing collected packages: jinja2
  Attempting uninstall: jinja2
    Found existing installation: Jinja2 2.11.3
    Uninstalling Jinja2-2.11.3:
      Successfully uninstalled Jinja2-2.11.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests~=2.23.0, but you have requests 2.27.1 which is incompatible.
flask 1.1.4 requires Jinja2<3.0,>=2.10.1, but you have jinja2 3.1.2 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Successfully installed jinja2-3.1.2


In [7]:
pip install xgboost==1.6.0




[런타임 다시 시작!!]

## Pycaret code

In [8]:
from pycaret.classification import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [9]:
train_data

<15906x67 sparse matrix of type '<class 'numpy.float64'>'
	with 270402 stored elements in Compressed Sparse Row format>

In [10]:
os.getcwd()

'/content/drive/Othercomputers/My MacBook Air/likelion/credit-now'

In [11]:
path = '/content/drive/MyDrive/likelion/credit-now/credit_data/'

In [12]:
train_data = pd.read_csv(path + 'train_data.csv')
train_data.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,age,employed_year,work_phone,phone,email,occyp_type,family_size,begin_year,credit
0,1,1,0,1,1,247500.0,1,0,2,0,31.178082,4.219178,0,0,1,1,3,0.416667,1
1,2,0,1,1,0,450000.0,0,1,0,0,52.293151,12.147945,0,1,0,4,2,1.833333,2
2,3,1,0,1,0,202500.0,1,0,0,0,41.336986,5.731507,0,1,0,3,2,3.083333,0
3,4,1,1,1,0,157500.0,3,1,0,0,41.19726,5.767123,0,0,0,4,2,2.166667,2
4,5,1,0,1,2,270000.0,0,0,0,0,36.747945,13.687671,0,0,1,6,4,1.5,1


In [13]:
train_cat_data = pd.read_csv(path + 'train_cat_data.csv')

In [None]:
model = setup(data = train_data, 
              target = 'credit',
              train_size = 0.7,
              session_id = 9) 

IntProgress(value=0, description='Processing: ', max=3)

Text(value="Following data types have been inferred automatically, if they are correct press enter to continue…

Unnamed: 0,Data Type
index,Numeric
gender,Categorical
car,Categorical
reality,Categorical
child_num,Categorical
income_total,Numeric
income_type,Categorical
edu_type,Categorical
family_type,Categorical
house_type,Categorical


In [None]:
# logloss 적용 및 predict_proba로 제출하기 위해 metric 추가 
add_metric('logloss', 'LogLoss', log_loss, greater_is_better = False, target = 'pred_proba')

In [None]:
# svm, ridge는 predict_proba 미지원으로 제외
top_5_models = compare_models(sort = 'logloss',
                              n_select = 5,
                              exclude = ['svm', 'ridge'])

In [None]:
xgb = create_model('xgboost') # creates a model and scores it using stratified cross validation (가능한 모델estimator 리스트는 docstring 참고)

In [None]:
xgb_tuned = tune_model(xgb, optimize='Accuracy') # tunes the hyperparameters of a model on a pre-defined search space and scores it using stratified cross validation

In [None]:
top_5_models

In [None]:
evaluate_model(top_5_models[0]) # 설정된 Hyper-params의 값, AUC/Confusion matrix 등 성능 지표, Feature importance 등을 바로 확인할 수 있음

In [None]:
# evaluate_model로 얻어지는 plot을 직접 지정하여 별도로 출력할 수 있음

model_top = top_5_models[0]

plot_model(model_top, plot = 'auc')
# plot_model(model_top, plot = 'pr')
# plot_model(model_top, plot='feature')
# plot_model(model_top, plot = 'confusion_matrix')

In [None]:
predict_model(top_5_models[0]) # 미리 제외시켜두었던 test data에 대한 예측 결과값