In [2]:
import numpy as np
import pandas as pd

> # 데이터 살펴보기


`age`
- Feature	Integer

`job`
- Feature	Categorical
- Occupation	type of job (categorical: 'admin.','blue collar', 'entrepreneur', 'housemaid', 'management', 'retired','self-employed','services','student','technician','unemployed','unknown')

`marital`
- Feature	Categorical
- Marital Status(categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)

`education`
- Feature	Categorical
- Education Level	(categorical:'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')

`default`
- Feature	Binary; has credit in default?

`balance`
- Feature	Integer;average yearly balance	euros

`housing`
- Feature	Binary; has housing loan?

`loan`
- Feature	Binary		has personal loan?

`contact`
- Feature	Categorical
- contact communication type (categorical: 'cellular','telephone')

`day_of_week`
- Feature	Date
- last contact day of the week

`month`
- Feature	Date
- last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')

`duration`
- Feature	Integer
- last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.

`campaign`
- Feature	Integer
- number of contacts performed during this campaign and for this client (numeric, includes last contact)

`pdays`
- Feature	Integer
- number of days that passed by after the client was last contacted from a previous campaign (numeric; -1 means client was not previously contacted)

`previous`
- Feature	Integer
number of contacts performed before this campaign and for this client

`poutcome`
- Feature	Categorical
- outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')

`y`
- Target
- Binary; has the client subscribed a term deposit?


In [3]:
data = pd.read_csv('bank-full.csv',sep=';')

In [4]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
data.shape

(45211, 17)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [7]:
data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
data[data["age"]<=0]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y


In [9]:
# 불균형 데이터이다.
data.y.value_counts(normalize=True)

y
no     0.883015
yes    0.116985
Name: proportion, dtype: float64

> # imbalanced binary data classification

## a. feature engineering

1. onehotencoding

2. label encoder

3. cat2vec

### a-1 onehotencoding

In [10]:
cat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [11]:
onehot = pd.get_dummies(data[cat], columns=cat, dtype=int)
onehot.reset_index(drop=True, inplace=True)

In [12]:
onehot

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,1
45207,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45208,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
45209,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1


In [13]:
# 인코딩된 데이터와 원래 데이터 합치기
data_onehot = pd.concat([data.drop(columns=cat), onehot], axis=1)

In [14]:
data_onehot.isna().sum()

age                    0
balance                0
day                    0
duration               0
campaign               0
pdays                  0
previous               0
y                      0
job_admin.             0
job_blue-collar        0
job_entrepreneur       0
job_housemaid          0
job_management         0
job_retired            0
job_self-employed      0
job_services           0
job_student            0
job_technician         0
job_unemployed         0
job_unknown            0
marital_divorced       0
marital_married        0
marital_single         0
education_primary      0
education_secondary    0
education_tertiary     0
education_unknown      0
default_no             0
default_yes            0
housing_no             0
housing_yes            0
loan_no                0
loan_yes               0
contact_cellular       0
contact_telephone      0
contact_unknown        0
month_apr              0
month_aug              0
month_dec              0
month_feb              0


In [15]:
data_onehot.to_csv("data_onehot.csv")

### a-2. label encoder

In [16]:
cat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [17]:
from sklearn import preprocessing

def encode_features(dataDF, categorical_features):
    label_encoders = {}
    for cat in categorical_features:
        le = preprocessing.LabelEncoder()
        le.fit(dataDF[cat])
        dataDF[cat] = le.transform(dataDF[cat])
        label_encoders[cat] = le
    return dataDF, label_encoders

In [18]:
data_label , label_encoders = encode_features(data, cat)

In [19]:
data_label.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,no
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,no
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,no
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,no
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,no


In [20]:
for feature, encoder in label_encoders.items():
    print(f"\n{feature}의 레이블 인코더:")
    for label, encoded_value in zip(encoder.classes_, encoder.transform(encoder.classes_)):
        print(f"{label} -> {encoded_value}")


job의 레이블 인코더:
admin. -> 0
blue-collar -> 1
entrepreneur -> 2
housemaid -> 3
management -> 4
retired -> 5
self-employed -> 6
services -> 7
student -> 8
technician -> 9
unemployed -> 10
unknown -> 11

marital의 레이블 인코더:
divorced -> 0
married -> 1
single -> 2

education의 레이블 인코더:
primary -> 0
secondary -> 1
tertiary -> 2
unknown -> 3

default의 레이블 인코더:
no -> 0
yes -> 1

housing의 레이블 인코더:
no -> 0
yes -> 1

loan의 레이블 인코더:
no -> 0
yes -> 1

contact의 레이블 인코더:
cellular -> 0
telephone -> 1
unknown -> 2

month의 레이블 인코더:
apr -> 0
aug -> 1
dec -> 2
feb -> 3
jan -> 4
jul -> 5
jun -> 6
mar -> 7
may -> 8
nov -> 9
oct -> 10
sep -> 11

poutcome의 레이블 인코더:
failure -> 0
other -> 1
success -> 2
unknown -> 3


In [21]:
data_label.to_csv("data_label.csv")

### a-3 cat2vec

In [22]:
data=pd.read_csv('bank-full.csv',sep=';')

In [23]:
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [24]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [25]:
cat = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

In [26]:
pip install hypertools



In [27]:
import gc, copy
from gensim.models import Word2Vec # categorical feature to vectors
from random import shuffle
import hypertools as hyp
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [28]:
import copy
import numpy as np
from random import shuffle
from gensim.models import Word2Vec

In [29]:
import copy
from random import shuffle
from gensim.models import Word2Vec

def apply_w2v(sentences, model, num_features):
    def _average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        n_words = 0.
        for word in words:
            if word in vocabulary:
                n_words = n_words + 1.
                feature_vector = np.add(feature_vector, model.wv[word])

        if n_words:
            feature_vector = np.divide(feature_vector, n_words)
        return feature_vector

    vocab = set(model.wv.index_to_key)
    feats = [_average_word_vectors(s, model, vocab, num_features) for s in sentences]
    return np.array(feats)

def gen_cat2vec_sentences(data):
    X_w2v = copy.deepcopy(data)
    names = list(X_w2v.columns.values)
    for c in names:
        X_w2v[c] = X_w2v[c].astype('category')
        new_categories = [c + " " + str(category) for category in X_w2v[c].cat.categories]
        new_series = np.array([new_categories[X_w2v[c].cat.codes[i]] for i in range(len(X_w2v[c]))])
        X_w2v[c] = new_series
    X_w2v = X_w2v.values.tolist()
    return X_w2v

def fit_cat2vec_model(data, n_cat2vec_feature=100, n_cat2vec_window=5):
    X_w2v = gen_cat2vec_sentences(data)
    model = Word2Vec(sentences=X_w2v, vector_size=n_cat2vec_feature, window=n_cat2vec_window)
    return model

In [30]:
n_cat2vec_feature  = len(cat) # define the cat2vecs dimentions

n_cat2vec_window   = len(cat) * 2 # define the w2v window size

In [31]:
c2v_model = fit_cat2vec_model(data,n_cat2vec_feature,n_cat2vec_window)

c2v_matrix = apply_w2v(gen_cat2vec_sentences(data.loc[:, cat]), c2v_model, n_cat2vec_feature)

In [32]:
c2v_matrix

array([[ 0.76608648,  0.44538458,  1.17820963, ...,  0.04872833,
         4.98245976,  3.23220949],
       [ 0.83068692,  0.35931898,  1.04830921, ...,  0.10672241,
         5.04996343,  3.20598378],
       [ 0.71505004,  0.51889535,  1.32987624, ..., -0.07188911,
         4.93097097,  3.37215963],
       ...,
       [-0.11316153,  0.1056112 ,  3.34436724, ..., -0.49473509,
         2.75993475,  1.55182172],
       [ 0.41155757,  0.76956425,  2.15345372, ..., -0.54803931,
         4.25615939,  2.67151266],
       [ 0.32062036,  0.19749756,  3.01662342, ..., -0.24835872,
         3.46815803,  1.61601592]])

In [33]:
cat_vectors_df = pd.DataFrame(c2v_matrix,  columns=[f"{col}_vector" for col in cat])
print(cat_vectors_df)

       job_vector  marital_vector  education_vector  default_vector  \
0        0.766086        0.445385          1.178210       -0.466644   
1        0.830687        0.359319          1.048309       -0.510102   
2        0.715050        0.518895          1.329876       -0.553933   
3        0.774036        0.561776          1.388103       -0.611898   
4        0.760645        0.572970          1.247120       -0.682838   
...           ...             ...               ...             ...   
45206    0.500960        0.527124          1.962865       -0.943226   
45207   -0.041685        0.686169          2.588897       -0.937290   
45208   -0.113162        0.105611          3.344367       -2.148050   
45209    0.411558        0.769564          2.153454       -1.103580   
45210    0.320620        0.197498          3.016623       -2.277741   

       housing_vector  loan_vector  contact_vector  month_vector  \
0           -0.297587     2.953416        0.048728      4.982460   
1          

In [34]:
numeric_df = data.drop(cat, axis=1)

In [35]:
data_cat2vec = pd.concat([cat_vectors_df, numeric_df], axis=1)

In [36]:
data_cat2vec

Unnamed: 0,job_vector,marital_vector,education_vector,default_vector,housing_vector,loan_vector,contact_vector,month_vector,poutcome_vector,age,balance,day,duration,campaign,pdays,previous,y
0,0.766086,0.445385,1.178210,-0.466644,-0.297587,2.953416,0.048728,4.982460,3.232209,58,2143,5,261,1,-1,0,no
1,0.830687,0.359319,1.048309,-0.510102,-0.326991,3.202834,0.106722,5.049963,3.205984,44,29,5,151,1,-1,0,no
2,0.715050,0.518895,1.329876,-0.553933,-0.485803,2.731439,-0.071889,4.930971,3.372160,33,2,5,76,1,-1,0,no
3,0.774036,0.561776,1.388103,-0.611898,-0.461987,2.683202,-0.136928,5.019955,3.454538,47,1506,5,92,1,-1,0,no
4,0.760645,0.572970,1.247120,-0.682838,-0.243407,2.770083,-0.249371,5.006390,3.080830,33,1,5,198,1,-1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.500960,0.527124,1.962865,-0.943226,0.116902,2.551402,-0.267225,4.095250,2.404189,51,825,17,977,3,-1,0,yes
45207,-0.041685,0.686169,2.588897,-0.937290,-0.052967,1.470333,-0.807042,3.396741,2.559605,71,1729,17,456,2,-1,0,yes
45208,-0.113162,0.105611,3.344367,-2.148050,1.132395,1.662428,-0.494735,2.759935,1.551822,72,5715,17,1127,5,184,3,yes
45209,0.411558,0.769564,2.153454,-1.103580,0.099801,2.124844,-0.548039,4.256159,2.671513,57,668,17,508,4,-1,0,no


In [37]:
data_cat2vec.to_csv("data_cat2vec.csv")

> # 각기 다른 방식으로 전처리한 데이터를 각기 다른 방식으로 샘플링

- data_onehot
- data_label
- data_cat2vec

## b. sampling

1. no sampling

2. smote _ oversampling

3. ctgan_oversampling

In [38]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred, pos_label='yes')
    recall = recall_score(y_test, pred, pos_label='yes')
    f1 = f1_score(y_test, pred,  pos_label='yes')
    roc_auc = roc_auc_score(y_test, pred_proba)
    print("오차행렬")
    print(confusion)
    print("정확도:{0:.4f}, 정밀도:{1:.4f}, 재현율:{2:.4f}, F1:{3:.4f}, AUC:{4:.4f}".format(accuracy, precision, recall, f1, roc_auc))



> ### data_onehot

In [39]:
data_onehot

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,no,0,0,...,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,no,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,no,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,no,0,1,...,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,no,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,yes,0,0,...,0,0,0,1,0,0,0,0,0,1
45207,71,1729,17,456,2,-1,0,yes,0,0,...,0,0,0,1,0,0,0,0,0,1
45208,72,5715,17,1127,5,184,3,yes,0,0,...,0,0,0,1,0,0,0,0,1,0
45209,57,668,17,508,4,-1,0,no,0,1,...,0,0,0,1,0,0,0,0,0,1


> #### b-1. no sampling

In [40]:
from sklearn.model_selection import train_test_split

X = data_onehot.drop("y", axis=1)
y = data_onehot["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [41]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [42]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [43]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7758  227]
 [ 638  420]]
정확도:0.9043, 정밀도:0.6491, 재현율:0.3970, F1:0.4927, AUC:0.9264


> #### b-2. smote

In [44]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)

X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [45]:
y_train_over.value_counts()

y
no     31937
yes    31937
Name: count, dtype: int64

In [46]:
X_train

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
24001,36,861,29,140,2,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
43409,24,4126,5,907,4,185,7,0,0,0,...,0,0,0,0,0,0,1,0,0,0
20669,44,244,12,1735,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
18810,48,0,31,35,11,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
23130,38,257,26,57,10,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17958,50,917,30,58,2,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
15941,36,22,22,77,5,-1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
16952,45,79,25,98,1,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
34781,27,2559,6,227,1,-1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [47]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_over, y_train_over)

In [48]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [49]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7709  276]
 [ 577  481]]
정확도:0.9057, 정밀도:0.6354, 재현율:0.4546, F1:0.5300, AUC:0.9267


> #### b-3. ctgan

In [50]:
new=len(X_train_over)-len(X_train)

In [51]:
train = pd.concat([X_train, y_train], axis=1)

In [52]:
train

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
24001,36,861,29,140,2,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
43409,24,4126,5,907,4,185,7,0,0,0,...,0,0,0,0,0,1,0,0,0,yes
20669,44,244,12,1735,4,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
18810,48,0,31,35,11,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
23130,38,257,26,57,10,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17958,50,917,30,58,2,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,no
15941,36,22,22,77,5,-1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,no
16952,45,79,25,98,1,-1,0,0,1,0,...,0,0,0,0,0,0,0,0,1,no
34781,27,2559,6,227,1,-1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,no


In [53]:
pip install ctgan

Collecting ctgan
  Downloading ctgan-0.9.1-py3-none-any.whl (24 kB)
Collecting rdt>=1.6.1 (from ctgan)
  Downloading rdt-1.10.1-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting Faker<20,>=17 (from rdt>=1.6.1->ctgan)
  Downloading Faker-19.13.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->ctgan)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->ctgan)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [54]:
from ctgan import CTGAN

In [55]:
selected_columns = [col for col in data_onehot.columns if col not in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']]

In [56]:
ctgan = CTGAN(epochs=10)
ctgan.fit(train, selected_columns)

In [57]:
synthetic_data = ctgan.sample(new)

In [58]:
synthetic_data

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,y
0,37,79,13,168,4,0,0,0,1,0,...,0,1,1,0,0,0,0,0,1,yes
1,55,3667,19,141,1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,yes
2,37,1195,19,66,1,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,yes
3,22,1269,24,229,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,no
4,53,595,28,97,3,77,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27701,49,230,5,253,1,-2,2,0,0,0,...,0,1,0,0,0,0,0,0,1,yes
27702,62,2817,1,260,2,-1,0,1,0,0,...,0,1,0,1,0,0,0,1,1,no
27703,37,10703,20,61,3,-1,4,1,0,0,...,0,1,0,0,0,1,0,0,1,yes
27704,47,4014,3,410,1,-1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,no


In [59]:
synthetic_data.y.value_counts(normalize=True)

y
no     0.749332
yes    0.250668
Name: proportion, dtype: float64

In [60]:
X_ctgan = synthetic_data.drop("y",axis=1)
y_ctgan = synthetic_data["y"]

In [61]:
ctgan_onehot_x = pd.concat([X_ctgan,X_train])

In [62]:
ctgan_onehot_y = pd.concat([y_ctgan, y_train])

In [63]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(ctgan_onehot_x, ctgan_onehot_y)

In [64]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [65]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7806  179]
 [ 668  390]]
정확도:0.9063, 정밀도:0.6854, 재현율:0.3686, F1:0.4794, AUC:0.9236


> ### data_label

In [66]:
data_label

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,no
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,no
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,no
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,no
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,9,1,2,0,825,0,0,0,17,9,977,3,-1,0,3,yes
45207,71,5,0,0,0,1729,0,0,0,17,9,456,2,-1,0,3,yes
45208,72,5,1,1,0,5715,0,0,0,17,9,1127,5,184,3,2,yes
45209,57,1,1,1,0,668,0,0,1,17,9,508,4,-1,0,3,no


>  #### b-1. no sampling

In [67]:
from sklearn.model_selection import train_test_split

X = data_label.drop("y", axis=1)
y = data_label["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [68]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [69]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [70]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7755  230]
 [ 617  441]]
정확도:0.9063, 정밀도:0.6572, 재현율:0.4168, F1:0.5101, AUC:0.9246


> #### b-2. smote

In [71]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)

X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [72]:
y_train_over.value_counts()

y
no     31937
yes    31937
Name: count, dtype: int64

In [73]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_over, y_train_over)

In [74]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [75]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7291  694]
 [ 371  687]]
정확도:0.8822, 정밀도:0.4975, 재현율:0.6493, F1:0.5633, AUC:0.9108


> #### b-3. ctgan

In [76]:
selected_columns = [col for col in data_label.columns if col not in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']]

In [77]:
selected_columns

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome',
 'y']

In [78]:
new=len(X_train_over)-len(X_train)
train = pd.concat([X_train, y_train], axis=1)

ctgan = CTGAN(epochs=10)
ctgan.fit(train, selected_columns)

In [79]:
synthetic_data = ctgan.sample(new)

In [80]:
synthetic_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,32,7,0,1,0,-561,0,1,0,27,5,147,2,-1,0,0,no
1,41,0,1,0,0,866,1,0,0,13,3,172,5,0,0,3,no
2,49,0,2,1,0,147,0,0,0,19,11,316,1,0,0,0,no
3,30,9,1,1,0,5802,1,0,0,2,11,243,4,1,0,2,no
4,46,0,1,1,0,-8,1,0,0,18,8,248,1,0,1,3,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27701,38,1,0,1,0,-147,1,0,0,15,3,510,2,-1,0,3,no
27702,21,5,1,0,0,13459,1,0,1,27,8,164,2,0,0,2,no
27703,29,9,0,0,0,271,0,0,0,27,0,158,10,-1,0,0,no
27704,42,1,1,1,0,1182,0,0,0,25,5,1210,2,-2,0,0,no


In [81]:
synthetic_data.y.value_counts(normalize=True)

y
no     0.885332
yes    0.114668
Name: proportion, dtype: float64

In [82]:
X_ctgan = synthetic_data.drop("y",axis=1)
y_ctgan = synthetic_data["y"]

In [83]:
ctgan_label_x = pd.concat([X_ctgan,X_train])
ctgan_label_y = pd.concat([y_ctgan, y_train])

In [84]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(ctgan_label_x, ctgan_label_y)

In [85]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [86]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7842  143]
 [ 747  311]]
정확도:0.9016, 정밀도:0.6850, 재현율:0.2940, F1:0.4114, AUC:0.9217


> ### data_cat2vec

In [87]:
data_cat2vec

Unnamed: 0,job_vector,marital_vector,education_vector,default_vector,housing_vector,loan_vector,contact_vector,month_vector,poutcome_vector,age,balance,day,duration,campaign,pdays,previous,y
0,0.766086,0.445385,1.178210,-0.466644,-0.297587,2.953416,0.048728,4.982460,3.232209,58,2143,5,261,1,-1,0,no
1,0.830687,0.359319,1.048309,-0.510102,-0.326991,3.202834,0.106722,5.049963,3.205984,44,29,5,151,1,-1,0,no
2,0.715050,0.518895,1.329876,-0.553933,-0.485803,2.731439,-0.071889,4.930971,3.372160,33,2,5,76,1,-1,0,no
3,0.774036,0.561776,1.388103,-0.611898,-0.461987,2.683202,-0.136928,5.019955,3.454538,47,1506,5,92,1,-1,0,no
4,0.760645,0.572970,1.247120,-0.682838,-0.243407,2.770083,-0.249371,5.006390,3.080830,33,1,5,198,1,-1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.500960,0.527124,1.962865,-0.943226,0.116902,2.551402,-0.267225,4.095250,2.404189,51,825,17,977,3,-1,0,yes
45207,-0.041685,0.686169,2.588897,-0.937290,-0.052967,1.470333,-0.807042,3.396741,2.559605,71,1729,17,456,2,-1,0,yes
45208,-0.113162,0.105611,3.344367,-2.148050,1.132395,1.662428,-0.494735,2.759935,1.551822,72,5715,17,1127,5,184,3,yes
45209,0.411558,0.769564,2.153454,-1.103580,0.099801,2.124844,-0.548039,4.256159,2.671513,57,668,17,508,4,-1,0,no


>  #### b-1. no sampling

In [88]:
from sklearn.model_selection import train_test_split

X = data_cat2vec.drop("y", axis=1)
y = data_cat2vec["y"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [89]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

In [90]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [91]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7736  249]
 [ 631  427]]
정확도:0.9027, 정밀도:0.6317, 재현율:0.4036, F1:0.4925, AUC:0.9190


> #### b-2. smote

In [92]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)

X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

In [93]:
y_train_over.value_counts()

y
no     31937
yes    31937
Name: count, dtype: int64

In [94]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_over, y_train_over)

In [95]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [96]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7290  695]
 [ 354  704]]
정확도:0.8840, 정밀도:0.5032, 재현율:0.6654, F1:0.5731, AUC:0.9133


> #### b-3. ctgan

In [97]:
import pandas as pd
import numpy as np

In [98]:
data_cat2vec = pd.read_csv("data_cat2vec.csv",index_col=0)

In [99]:
data_cat2vec

Unnamed: 0,job_vector,marital_vector,education_vector,default_vector,housing_vector,loan_vector,contact_vector,month_vector,poutcome_vector,age,balance,day,duration,campaign,pdays,previous,y
0,0.766086,0.445385,1.178210,-0.466644,-0.297587,2.953416,0.048728,4.982460,3.232209,58,2143,5,261,1,-1,0,no
1,0.830687,0.359319,1.048309,-0.510102,-0.326991,3.202834,0.106722,5.049963,3.205984,44,29,5,151,1,-1,0,no
2,0.715050,0.518895,1.329876,-0.553933,-0.485803,2.731439,-0.071889,4.930971,3.372160,33,2,5,76,1,-1,0,no
3,0.774036,0.561776,1.388103,-0.611898,-0.461987,2.683202,-0.136928,5.019955,3.454538,47,1506,5,92,1,-1,0,no
4,0.760645,0.572970,1.247120,-0.682838,-0.243407,2.770083,-0.249371,5.006390,3.080830,33,1,5,198,1,-1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,0.500960,0.527124,1.962865,-0.943226,0.116902,2.551402,-0.267225,4.095250,2.404189,51,825,17,977,3,-1,0,yes
45207,-0.041685,0.686169,2.588897,-0.937290,-0.052967,1.470333,-0.807042,3.396741,2.559605,71,1729,17,456,2,-1,0,yes
45208,-0.113162,0.105611,3.344367,-2.148050,1.132395,1.662428,-0.494735,2.759935,1.551822,72,5715,17,1127,5,184,3,yes
45209,0.411558,0.769564,2.153454,-1.103580,0.099801,2.124844,-0.548039,4.256159,2.671513,57,668,17,508,4,-1,0,no


In [100]:
# selected_columns = [col for col in data_cat2vec.columns if col not in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']]
selected_columns = ['y']

In [101]:
selected_columns

['y']

In [102]:
from ctgan import CTGAN

In [103]:
new=len(X_train_over)-len(X_train)
train = pd.concat([X_train, y_train], axis=1)

In [104]:
ctgan = CTGAN(epochs=10)
ctgan.fit(train, selected_columns)

In [105]:
synthetic_data = ctgan.sample(new)

In [106]:
synthetic_data

Unnamed: 0,job_vector,marital_vector,education_vector,default_vector,housing_vector,loan_vector,contact_vector,month_vector,poutcome_vector,age,balance,day,duration,campaign,pdays,previous,y
0,0.450007,0.385073,2.181801,-2.230952,-0.012082,2.891066,0.110962,4.142839,3.350159,64,855,11,276,1,-3,2,yes
1,0.607177,0.057367,1.678019,-0.613132,-0.165675,2.860392,-0.389752,3.497437,3.574485,40,-31,16,180,1,-3,0,no
2,0.525196,0.415609,2.626939,-1.007290,-0.279510,2.633582,0.074201,4.449693,1.842508,34,-19,12,380,1,-2,0,no
3,0.395273,-0.118823,1.613055,-0.621970,1.048574,2.582827,-0.301422,4.280116,3.173982,28,323,19,169,1,160,2,yes
4,0.680085,0.483482,3.525709,-1.148337,-0.419668,3.095214,-0.052186,3.590698,2.593548,27,3406,9,537,3,-3,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27701,-0.065056,0.273248,2.477453,-0.861485,-0.301026,2.732224,-0.124200,3.284671,1.948703,35,-331,22,690,1,-3,0,yes
27702,0.533138,0.147418,2.792546,-2.257039,-0.400032,2.976777,-0.448410,3.269438,2.706058,38,-1031,12,1684,1,-2,0,yes
27703,0.571863,0.353128,2.057307,-0.957710,-0.391616,2.567093,-0.042959,4.856922,2.395622,59,309,10,331,1,-3,0,no
27704,0.616009,0.339360,2.187279,-2.410818,-0.196172,2.560426,-0.122757,4.371063,1.829530,39,4996,4,381,1,-3,3,yes


In [107]:
synthetic_data.y.value_counts(normalize=True)

y
no     0.550494
yes    0.449506
Name: proportion, dtype: float64

In [108]:
X_ctgan = synthetic_data.drop("y",axis=1)
y_ctgan = synthetic_data["y"]

In [109]:
ctgan_label_x = pd.concat([X_ctgan,X_train])
ctgan_label_y = pd.concat([y_ctgan, y_train])

In [110]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(ctgan_label_x, ctgan_label_y)

In [111]:
y_pred = rf_classifier.predict(X_test)
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]

In [112]:
get_clf_eval(y_test, y_pred, y_pred_proba)

오차행렬
[[7742  243]
 [ 623  435]]
정확도:0.9042, 정밀도:0.6416, 재현율:0.4112, F1:0.5012, AUC:0.9162
