In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# 데이터 탐색
df = pd.read_csv('data/cs_data.csv', index_col='Unnamed: 0')
# print(df.info())


# 데이터 전처리
## nan 삭제
df2 = df.dropna(axis=0)
# df2= df2.astype({'SeriousDlqin2yrs':'object'})
# print(df2.info())
# print(df2['SeriousDlqin2yrs'].unique())


## x, y 분리 및 train test 분리
x = df2.drop(columns='SeriousDlqin2yrs')
y = df2['SeriousDlqin2yrs']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, stratify=y, random_state=0)
# print(x_train.shape, x_test.shape)
# print(y_train.shape, y_test.shape)



# 머신러닝 
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

pred_train = rfc.predict(x_train)
pred_test = rfc.predict(x_test)

pred_train_proba = rfc.predict_proba(x_train)
pred_test_proba = rfc.predict_proba(x_test)

# print(pred_train, pred_train_proba)
# print(pred_test, pred_test_proba)


# 평가지표 ROC-AUC / 목표 0.85이상
train_score = roc_auc_score(y_train, pred_train_proba[:,1])
test_score = roc_auc_score(y_test, pred_test_proba[:,1])

print('Train Score:', train_score)
print('Test Score:', test_score)

fi = pd.Series(rfc.feature_importances_, index=x.columns).sort_values(ascending=False)
print(fi)


Train Score: 0.9999960281721225
Test Score: 0.8316232239282009
RevolvingUtilizationOfUnsecuredLines    0.186373
DebtRatio                               0.174193
MonthlyIncome                           0.162886
age                                     0.121884
NumberOfOpenCreditLinesAndLoans         0.089088
NumberOfTimes90DaysLate                 0.087444
NumberOfTime30-59DaysPastDueNotWorse    0.051859
NumberOfTime60-89DaysPastDueNotWorse    0.046551
NumberOfDependents                      0.044847
NumberRealEstateLoansOrLines            0.034875
dtype: float64


In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# 데이터 탐색
df = pd.read_csv('data/cs_data.csv', index_col='Unnamed: 0')
# print(df.info())


# 데이터 전처리
## income 은 평균, dependencts 는 중간값으로 변경
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean())
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(df['NumberOfDependents'].median())
# print(df.info())


## x, y 분리 및 train test 분리
x = df.drop(columns='SeriousDlqin2yrs')
y = df['SeriousDlqin2yrs']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, stratify=y, random_state=0)
# print(x_train.shape, x_test.shape)
# print(y_train.shape, y_test.shape)



# 머신러닝 
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

pred_train = rfc.predict(x_train)
pred_test = rfc.predict(x_test)

pred_train_proba = rfc.predict_proba(x_train)
pred_test_proba = rfc.predict_proba(x_test)

# print(pred_train, pred_train_proba)
# print(pred_test, pred_test_proba)


# 평가지표 ROC-AUC / 목표 0.85이상
train_score = roc_auc_score(y_train, pred_train_proba[:,1])
test_score = roc_auc_score(y_test, pred_test_proba[:,1])

print('Train Score:', train_score)
print('Test Score:', test_score)

fi = pd.Series(rfc.feature_importances_, index=x.columns).sort_values(ascending=False)
print(fi)


Train Score: 0.9999722823917778
Test Score: 0.8420662400081242
RevolvingUtilizationOfUnsecuredLines    0.195777
DebtRatio                               0.179189
MonthlyIncome                           0.144794
age                                     0.126892
NumberOfOpenCreditLinesAndLoans         0.088706
NumberOfTimes90DaysLate                 0.087898
NumberOfTime30-59DaysPastDueNotWorse    0.050186
NumberOfTime60-89DaysPastDueNotWorse    0.049640
NumberOfDependents                      0.041850
NumberRealEstateLoansOrLines            0.035067
dtype: float64


In [60]:
# 그리디 서치
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV


# 데이터 탐색
df = pd.read_csv('data/cs_data.csv', index_col='Unnamed: 0')
# print(df.info())


# 데이터 전처리
## income 은 평균, dependencts 는 중간값으로 변경
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(df['MonthlyIncome'].mean())
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(df['NumberOfDependents'].median())
# print(df.info())


## x, y 분리 및 train test 분리
x = df.drop(columns='SeriousDlqin2yrs')
y = df['SeriousDlqin2yrs']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, stratify=y, random_state=0)
# print(x_train.shape, x_test.shape)
# print(y_train.shape, y_test.shape)



# 머신러닝 
model = RandomForestClassifier(random_state=0)
params = {
    'n_estimators' : [300, 400, 500, 600, 700],
    'max_depth': [3,4,5, 6, 7],
}

gs = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='roc_auc',
    cv=4,
    n_jobs=-1
)


gs.fit(x_train, y_train)
print(gs.best_params_)
print(gs.best_estimator_)
print(gs.best_score_)


best_model = gs.best_estimator_

pred_train = best_model.predict(x_train)
pred_test = best_model.predict(x_test)

pred_train_proba = best_model.predict_proba(x_train)
pred_test_proba = best_model.predict_proba(x_test)


# 평가지표 ROC-AUC / 목표 0.85이상
train_score = roc_auc_score(y_train, pred_train_proba[:,1])
test_score = roc_auc_score(y_test, pred_test_proba[:,1])

print('Train Score:', train_score)
print('Test Score:', test_score)





{'max_depth': 7, 'n_estimators': 600}
RandomForestClassifier(max_depth=7, n_estimators=600, random_state=0)
0.8614525647530895
Train Score: 0.8687790882843358
Test Score: 0.8631080363032408
