In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',50)

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the competition files: labelled training rows, unlabelled test rows,
# and the sample submission that defines the output layout.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/daconcard/open/train.csv',
        '/kaggle/input/daconcard/open/test.csv',
        '/kaggle/input/daconcard/open/sample_submission.csv',
    )
)

In [None]:
def datainfo(df):
    """Summarise every column of ``df`` in a small overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``name``, ``dtype``,
        ``missing`` (count of NaN values), ``nunique`` (distinct non-null
        values) and ``values :5`` (the first five distinct values in order
        of appearance).
    """
    summary_rows = []
    for column_name in df.columns:
        series = df[column_name]
        summary_rows.append((
            column_name,
            series.dtype,
            series.isna().sum(),
            series.nunique(),
            series.unique()[:5],
        ))
    header = ['name', 'dtype', 'missing', 'nunique', 'values :5']
    return pd.DataFrame(summary_rows, columns=header)

In [None]:
# Column overview (dtype, missing count, cardinality, sample values) of the training data.
datainfo(df=train)

In [None]:
# Column overview (dtype, missing count, cardinality, sample values) of the test data.
datainfo(df=test)

In [None]:
# Column overview (dtype, missing count, cardinality, sample values) of the sample submission.
datainfo(df=submit)

In [None]:
## Drop columns: FLAG_MOBIL (noted in this notebook's EDA as only ever being 1)
## and the row identifier `index`.
train=train.drop(['FLAG_MOBIL','index'],axis=1)
test=test.drop(['FLAG_MOBIL','index'],axis=1)

## Check for and remove duplicates.
## Rows are treated as duplicates when all of these attributes match;
## columns not listed here are ignored for the comparison.
cols=['gender', 'car', 'reality', 'child_num', 'income_total',
      'income_type', 'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH',
      'DAYS_EMPLOYED','work_phone', 'phone', 'email',
      'occyp_type', 'family_size']

## Assume the most recently entered record is the true one, even if it differs.
train=train.drop_duplicates(subset=cols,keep='last')

## Employment days: flip the sign, then replace everything that is negative
## after the flip (i.e. positive in the raw data, which the EDA notes describe
## as "not employed") with the sentinel -999.
train['DAYS_EMPLOYED']=-train['DAYS_EMPLOYED']
test['DAYS_EMPLOYED']=-test['DAYS_EMPLOYED']
train.loc[train['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED']=-999
test.loc[test['DAYS_EMPLOYED']<0,'DAYS_EMPLOYED']=-999

## Family size: drop the rows with family_size 15 or 20.
## The previous condition `(x != 20) | (x != 15)` is true for every value, so
## nothing was removed; both exclusions have to hold at the same time.
## `.copy()` makes the filtered frame independent so the assignment below
## does not raise a SettingWithCopyWarning.
train=train[~train['family_size'].isin([15,20])].copy()

# Missing values: occyp_type gets an explicit 'unknown' category.
train['occyp_type']=train['occyp_type'].fillna('unknown')
test['occyp_type']=test['occyp_type'].fillna('unknown')

In [None]:
# One-hot encode train and test together so that both frames end up with the
# same dummy columns and the same dropped reference level per feature.
# Encoding them separately produces mismatched columns whenever a category
# is missing from one of the two files.
target=train['credit']
combined=pd.concat([train.drop(columns='credit'),test],keys=['train','test'])
combined=pd.get_dummies(combined,drop_first=True)

# Split back by the concat key; the label is re-attached to the training
# part (it now sits in the last column).
train=combined.loc['train'].assign(credit=target)
test=combined.loc['test']

In [None]:
# (rows, columns) of the encoded train and test frames, side by side.
(train.shape, test.shape)

In [None]:
# Separate the label column `credit` from the feature matrix.
y = train.loc[:, 'credit']
X = train.drop(columns='credit')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows for validation, keeping the class proportions of
# `y` in both parts; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    stratify=y,
    random_state=71,
)

In [None]:
import xgboost as xgb

# `credit` is a three-class target (the submission cell writes probability
# columns 0, 1 and 2), so use the multi-class probability objective.
# The previous value 'binary:mlogloss' is not an XGBoost objective name.
params={
    'objective':'multi:softprob',
    'random_state':71
}
model=xgb.XGBClassifier(**params)

# Track multi-class log loss on both splits; training stops once the score on
# the hold-out split (the last entry of eval_set) has not improved for 20 rounds.
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() is
# deprecated in newer xgboost releases in favour of constructor arguments;
# kept here to stay compatible with the version this notebook was written for.
model.fit(X_train,y_train,
          eval_set=[(X_train, y_train),(X_test,y_test)],
          eval_metric='mlogloss',verbose=True,
          early_stopping_rounds=20)

In [None]:
# Bar chart of the ten most important features of the fitted model.
xgb.plot_importance(booster=model, max_num_features=10)

In [None]:
# Predicted class probabilities for the test rows, one column per class.
class_probs = model.predict_proba(test)

# Attach the ids from the sample submission and put them in front of the
# three probability columns.
pred = pd.DataFrame(class_probs).assign(index=submit['index'])
pred = pred.loc[:, ['index', 0, 1, 2]]
pred.head()

In [None]:
# Write the submission file without the DataFrame's own row index.
pred.to_csv(path_or_buf='20210501_xgboost_earlystop.csv', index=False)

In [None]:
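## income_total is not normally distributed -> add its log transform as a derived feature.
## If nothing else, people whose income_type is pensioner have the oldest DAYS_BIRTH values.
## -> DAYS_BIRTH ranges from roughly -7500 to -25000; is it worth converting it to age? -> on hold
## Depending on edu_type, income_total and DAYS_BIRTH show a linear relationship.
## Positive DAYS_EMPLOYED values mean the person is not employed -> remove
## income_total does not differ by family type or gender.
## FLAG_MOBIL only ever takes the value 1 -> drop
## Remove duplicate rows.
## family_size and child_num are correlated -> drop; drop income_total / income_total_log; leave everything else as is.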


# Build the plotting frame from the raw file: at this point `train` has been
# one-hot encoded (so `family_type` no longer exists as a single column) and
# `income_total_log` is never created anywhere else in the notebook.
# NOTE(review): assumes the natural log is the intended transform -- confirm.
eda = pd.read_csv('/kaggle/input/daconcard/open/train.csv')
eda['income_total_log'] = np.log(eda['income_total'])

sns.jointplot(data=eda, x="income_total_log", y="DAYS_BIRTH", hue="family_type")
# sns.displot(data=train, x="income_total", col="DAYS_BIRTH", kde=True)
# sns.relplot(data=train,x='income_total',y='DAYS_BIRTH',hue='edu_tpe')

In [None]:
## Continuous vs. continuous -- seaborn reference templates.
# These calls use the column names of seaborn's example datasets, so the
# datasets are loaded explicitly; previously `tips` and `penguins` were
# undefined and the first plot pointed at `train`, which has none of the
# `tip` / `time` / `smoker` / `size` columns.
# sns.load_dataset downloads the data on first use and needs network access.
tips = sns.load_dataset("tips")
penguins = sns.load_dataset("penguins")

sns.relplot(
    data=tips,
    x="total_bill", y="tip", col="time",
    hue="smoker", style="smoker", size="size",
)

sns.lmplot(data=tips, x="total_bill", y="tip", col="time", hue="smoker")
sns.displot(data=tips, x="total_bill", col="time", kde=True)
sns.jointplot(data=penguins, x="flipper_length_mm", y="bill_length_mm", hue="species")

In [None]:
# Plot from the raw file: FLAG_MOBIL has already been dropped from `train`,
# and `income_total_log` is never created anywhere else in the notebook.
# NOTE(review): assumes the natural log is the intended transform -- confirm.
eda = pd.read_csv('/kaggle/input/daconcard/open/train.csv')
eda['income_total_log'] = np.log(eda['income_total'])

sns.catplot(data=eda,kind='violin',x='FLAG_MOBIL',y='income_total_log',hue='credit')

In [None]:
# Plot from the raw file: `gender` and `family_type` have been one-hot encoded
# in `train`, and `income_total_log` is never created anywhere else in the notebook.
# NOTE(review): assumes the natural log is the intended transform -- confirm.
eda = pd.read_csv('/kaggle/input/daconcard/open/train.csv')
eda['income_total_log'] = np.log(eda['income_total'])

sns.catplot(data=eda, kind="violin", x="gender", y="income_total_log", hue="family_type",split=False)

In [None]:
## Categorical vs. continuous -- seaborn reference template.
# `tips` is seaborn's example dataset and was undefined before; load it here
# (sns.load_dataset downloads the data on first use and needs network access).
tips = sns.load_dataset("tips")

# Swarm plots separate the hue levels with `dodge`; `split` is the violin-plot option.
sns.catplot(data=tips, kind="swarm", x="day", y="total_bill", hue="smoker", dodge=True)