In [48]:
import pandas as pd
pd.DataFrame.iteritems = pd.DataFrame.items
import numpy as np

# Data Load

In [49]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [50]:
train_x = train.drop(columns=['ID', 'TARGET']) 
train_y = train['TARGET']

test_x = test.drop(columns=['ID'])

In [51]:
train_y.value_counts()

TARGET
0    53572
1     6428
Name: count, dtype: int64

# Preprocessing

In [13]:
train.isnull().sum()

ID                0
TARGET            0
성별                0
차량 소유 여부          0
부동산 소유 여부         0
자녀 수              0
연간 수입             0
수입 유형             0
최종 학력             0
결혼 여부             0
주거 형태             0
거주지 인구 비율         0
휴대전화 소유 여부        0
업무용 휴대전화 소유 여부    0
이메일 소유 여부         0
직업                0
가족 구성원 수          0
산업군               0
나이                0
근속연수              0
가입연수              0
dtype: int64

In [15]:
test.isnull().sum()

ID                0
성별                0
차량 소유 여부          0
부동산 소유 여부         0
자녀 수              0
연간 수입             0
수입 유형             0
최종 학력             0
결혼 여부             0
주거 형태             0
거주지 인구 비율         0
휴대전화 소유 여부        0
업무용 휴대전화 소유 여부    0
이메일 소유 여부         0
직업                0
가족 구성원 수          0
산업군               0
나이                0
근속연수              0
가입연수              0
dtype: int64

In [17]:
train.duplicated().sum()

0

In [18]:
test.duplicated().sum()

0

In [20]:
train.dtypes

ID                 object
TARGET              int64
성별                 object
차량 소유 여부            int64
부동산 소유 여부           int64
자녀 수                int64
연간 수입             float64
수입 유형              object
최종 학력              object
결혼 여부              object
주거 형태              object
거주지 인구 비율         float64
휴대전화 소유 여부          int64
업무용 휴대전화 소유 여부      int64
이메일 소유 여부           int64
직업                 object
가족 구성원 수          float64
산업군                object
나이                  int64
근속연수                int64
가입연수              float64
dtype: object

In [52]:
df_x = pd.concat([train_x, test_x])
df_x.shape

(100000, 19)

In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

categorical = ['성별', '수입 유형', '최종 학력', '결혼 여부', '주거 형태', '직업', '산업군']

for cat in categorical:
    df_x[cat] = le.fit_transform(df_x[cat].values)

In [54]:
train_x = df_x[:60000]
test_x = df_x[60000:]

In [19]:
train_y.value_counts()  # Imbalanced Dataset 

TARGET
0    53572
1     6428
Name: count, dtype: int64

# Oversampling

## SMOTE
minority class에 속한 $i$번째 observation의 feature variable $x_i$에 대해 **knn**(k-nearest neighbors) set $S_i$ 생성  
k개의 neighbors는 모두 minority class에 속해 있음  
$x_{syn} = x_i + λ(x_k - x_i), x_k ∈ S_i$  
($x_k$는 $S_i$에서 임의로 추출, λ는 0~1의 uniform distribution에서 추출)  
=> 소수 클래스에 속한 모든 observation에 대해 반복

![nn](/Users/seungju/Desktop/smote.png)

## ADASYN
SMOTE와 동일하지만, minority class에 있는 각 $x_i$에 대응하여 생성된 합성표본수를 $S_i$ 안에 포함된 majority class의 표본 수에 비례하도록 추출한 것이 차이

![nn](/Users/seungju/Desktop/adasyn.png)

In [55]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

oversampler = RandomOverSampler(random_state=42)
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

train_x_oversampled, train_y_oversampled = oversampler.fit_resample(train_x, train_y)
train_x_smote, train_y_smote = smote.fit_resample(train_x, train_y)
train_x_adasyn, train_y_adasyn = adasyn.fit_resample(train_x, train_y)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():


**original**: *train_x*, *train_y*  
**random**: *train_x_oversampled*, *train_y_oversampled*  
**smote**: *train_x_smote*, *train_y_smote*  
**adasyn**: *train_x_adasyn*, *train_y_adasyn*  

In [56]:
from catboost import Pool, CatBoostClassifier
train_dataset = Pool(data=train_x, label=train_y)
cat = CatBoostClassifier(eval_metric='Accuracy')
cat.fit(train_dataset, use_best_model=True)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


Learning rate set to 0.059184
0:	learn: 0.8928667	total: 65.7ms	remaining: 1m 5s
1:	learn: 0.8928667	total: 76.6ms	remaining: 38.2s
2:	learn: 0.8928667	total: 86.1ms	remaining: 28.6s
3:	learn: 0.8928667	total: 94.7ms	remaining: 23.6s
4:	learn: 0.8928667	total: 104ms	remaining: 20.7s
5:	learn: 0.8928667	total: 113ms	remaining: 18.8s
6:	learn: 0.8928667	total: 122ms	remaining: 17.3s
7:	learn: 0.8928667	total: 131ms	remaining: 16.2s
8:	learn: 0.8928667	total: 139ms	remaining: 15.4s
9:	learn: 0.8928667	total: 149ms	remaining: 14.8s
10:	learn: 0.8928667	total: 159ms	remaining: 14.3s
11:	learn: 0.8928667	total: 169ms	remaining: 13.9s
12:	learn: 0.8928667	total: 179ms	remaining: 13.6s
13:	learn: 0.8928667	total: 193ms	remaining: 13.6s
14:	learn: 0.8928667	total: 204ms	remaining: 13.4s
15:	learn: 0.8928667	total: 213ms	remaining: 13.1s
16:	learn: 0.8928667	total: 223ms	remaining: 12.9s
17:	learn: 0.8928667	total: 236ms	remaining: 12.8s
18:	learn: 0.8928667	total: 255ms	remaining: 13.2s
19:	lea

168:	learn: 0.8929667	total: 1.58s	remaining: 7.79s
169:	learn: 0.8929667	total: 1.6s	remaining: 7.79s
170:	learn: 0.8929667	total: 1.61s	remaining: 7.79s
171:	learn: 0.8929667	total: 1.62s	remaining: 7.8s
172:	learn: 0.8929667	total: 1.64s	remaining: 7.82s
173:	learn: 0.8929667	total: 1.65s	remaining: 7.82s
174:	learn: 0.8929833	total: 1.66s	remaining: 7.82s
175:	learn: 0.8929833	total: 1.67s	remaining: 7.81s
176:	learn: 0.8929833	total: 1.68s	remaining: 7.8s
177:	learn: 0.8929833	total: 1.69s	remaining: 7.79s
178:	learn: 0.8929833	total: 1.7s	remaining: 7.79s
179:	learn: 0.8930000	total: 1.71s	remaining: 7.78s
180:	learn: 0.8930000	total: 1.72s	remaining: 7.76s
181:	learn: 0.8929833	total: 1.73s	remaining: 7.75s
182:	learn: 0.8930000	total: 1.74s	remaining: 7.75s
183:	learn: 0.8930000	total: 1.75s	remaining: 7.74s
184:	learn: 0.8930000	total: 1.75s	remaining: 7.73s
185:	learn: 0.8929833	total: 1.76s	remaining: 7.73s
186:	learn: 0.8930333	total: 1.77s	remaining: 7.71s
187:	learn: 0.89

339:	learn: 0.8936167	total: 3.18s	remaining: 6.17s
340:	learn: 0.8936167	total: 3.19s	remaining: 6.16s
341:	learn: 0.8936167	total: 3.2s	remaining: 6.15s
342:	learn: 0.8936167	total: 3.21s	remaining: 6.15s
343:	learn: 0.8936333	total: 3.22s	remaining: 6.14s
344:	learn: 0.8936333	total: 3.23s	remaining: 6.13s
345:	learn: 0.8936333	total: 3.24s	remaining: 6.12s
346:	learn: 0.8936500	total: 3.25s	remaining: 6.11s
347:	learn: 0.8936667	total: 3.25s	remaining: 6.1s
348:	learn: 0.8936667	total: 3.26s	remaining: 6.08s
349:	learn: 0.8936667	total: 3.27s	remaining: 6.08s
350:	learn: 0.8936667	total: 3.28s	remaining: 6.07s
351:	learn: 0.8937000	total: 3.29s	remaining: 6.05s
352:	learn: 0.8937000	total: 3.3s	remaining: 6.04s
353:	learn: 0.8937000	total: 3.31s	remaining: 6.03s
354:	learn: 0.8937000	total: 3.31s	remaining: 6.02s
355:	learn: 0.8937167	total: 3.32s	remaining: 6.01s
356:	learn: 0.8937167	total: 3.33s	remaining: 6s
357:	learn: 0.8937167	total: 3.34s	remaining: 5.99s
358:	learn: 0.8937

502:	learn: 0.8943333	total: 4.75s	remaining: 4.7s
503:	learn: 0.8943333	total: 4.77s	remaining: 4.69s
504:	learn: 0.8943333	total: 4.78s	remaining: 4.68s
505:	learn: 0.8943333	total: 4.78s	remaining: 4.67s
506:	learn: 0.8943333	total: 4.8s	remaining: 4.67s
507:	learn: 0.8943333	total: 4.81s	remaining: 4.66s
508:	learn: 0.8943333	total: 4.82s	remaining: 4.65s
509:	learn: 0.8943333	total: 4.83s	remaining: 4.64s
510:	learn: 0.8943333	total: 4.83s	remaining: 4.63s
511:	learn: 0.8943167	total: 4.84s	remaining: 4.62s
512:	learn: 0.8943500	total: 4.85s	remaining: 4.61s
513:	learn: 0.8943500	total: 4.86s	remaining: 4.59s
514:	learn: 0.8943667	total: 4.87s	remaining: 4.58s
515:	learn: 0.8943667	total: 4.87s	remaining: 4.57s
516:	learn: 0.8943667	total: 4.88s	remaining: 4.56s
517:	learn: 0.8943667	total: 4.89s	remaining: 4.55s
518:	learn: 0.8943667	total: 4.9s	remaining: 4.54s
519:	learn: 0.8943667	total: 4.91s	remaining: 4.53s
520:	learn: 0.8943500	total: 4.91s	remaining: 4.52s
521:	learn: 0.8

663:	learn: 0.8950333	total: 6.15s	remaining: 3.11s
664:	learn: 0.8950167	total: 6.16s	remaining: 3.1s
665:	learn: 0.8950667	total: 6.16s	remaining: 3.09s
666:	learn: 0.8950667	total: 6.17s	remaining: 3.08s
667:	learn: 0.8950667	total: 6.18s	remaining: 3.07s
668:	learn: 0.8950667	total: 6.19s	remaining: 3.06s
669:	learn: 0.8951000	total: 6.2s	remaining: 3.05s
670:	learn: 0.8951000	total: 6.21s	remaining: 3.04s
671:	learn: 0.8951500	total: 6.22s	remaining: 3.03s
672:	learn: 0.8951500	total: 6.23s	remaining: 3.03s
673:	learn: 0.8951500	total: 6.24s	remaining: 3.02s
674:	learn: 0.8951667	total: 6.25s	remaining: 3.01s
675:	learn: 0.8951667	total: 6.25s	remaining: 3s
676:	learn: 0.8952000	total: 6.26s	remaining: 2.99s
677:	learn: 0.8952167	total: 6.27s	remaining: 2.98s
678:	learn: 0.8952167	total: 6.28s	remaining: 2.97s
679:	learn: 0.8952167	total: 6.29s	remaining: 2.96s
680:	learn: 0.8952167	total: 6.3s	remaining: 2.95s
681:	learn: 0.8952500	total: 6.31s	remaining: 2.94s
682:	learn: 0.8952

830:	learn: 0.8962333	total: 7.54s	remaining: 1.53s
831:	learn: 0.8962500	total: 7.55s	remaining: 1.52s
832:	learn: 0.8962833	total: 7.56s	remaining: 1.52s
833:	learn: 0.8962667	total: 7.57s	remaining: 1.51s
834:	learn: 0.8962833	total: 7.58s	remaining: 1.5s
835:	learn: 0.8962833	total: 7.59s	remaining: 1.49s
836:	learn: 0.8962667	total: 7.6s	remaining: 1.48s
837:	learn: 0.8962667	total: 7.61s	remaining: 1.47s
838:	learn: 0.8962833	total: 7.62s	remaining: 1.46s
839:	learn: 0.8963000	total: 7.63s	remaining: 1.45s
840:	learn: 0.8963000	total: 7.63s	remaining: 1.44s
841:	learn: 0.8963000	total: 7.64s	remaining: 1.43s
842:	learn: 0.8963000	total: 7.65s	remaining: 1.43s
843:	learn: 0.8962833	total: 7.66s	remaining: 1.42s
844:	learn: 0.8962500	total: 7.67s	remaining: 1.41s
845:	learn: 0.8962667	total: 7.68s	remaining: 1.4s
846:	learn: 0.8962833	total: 7.69s	remaining: 1.39s
847:	learn: 0.8963000	total: 7.69s	remaining: 1.38s
848:	learn: 0.8963167	total: 7.7s	remaining: 1.37s
849:	learn: 0.89

997:	learn: 0.8974333	total: 9.14s	remaining: 18.3ms
998:	learn: 0.8974667	total: 9.15s	remaining: 9.16ms
999:	learn: 0.8974500	total: 9.16s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f8375d60910>

In [64]:
sample_submission = pd.read_csv("data/sample_submission.csv")
catboost_submission = sample_submission.copy()
pred_probability = cat.predict_proba(test_x)
catboost_submission['TARGET'] = pred_probability[:,1]
catboost_submission
catboost_submission.to_csv('cat_last_proba.csv', index=False)

In [63]:
train_y.value_counts()

TARGET
0    53572
1     6428
Name: count, dtype: int64

In [65]:
pred_probability = pd.DataFrame(pred)
pred_probability.value_counts()

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'