In [201]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import time
import os
import warnings
warnings.filterwarnings('ignore')
print(os.listdir('../input/cat-in-the-dat'))

['train.csv', 'test.csv', 'sample_submission.csv']


In [274]:
train = pd.read_csv('../input/cat-in-the-dat/train.csv')
test = pd.read_csv('../input/cat-in-the-dat/test.csv')
submission = pd.read_csv('../input/cat-in-the-dat/sample_submission.csv')

In [275]:
print("Train Dataset Shape: ", train.shape)
train.head()

Train Dataset Shape:  (300000, 25)


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


- Binary Data: 두 가지 값으로만 구성된 변수.(1, 0), (Y, N)
- Categorical Data: 여러 가지 값으로 구성된 변수.(요일 -> (1,2,3,4,5,6,7))
- Ordinal Data: 순서를 가진 값으로 구성된 변수.(영화 평점 등)
- Nominal Data: 수치적인 중요성이 없는 값으로 구성된 변수.(이름, 직업)

이 커널에서는 Binary, Ordinal, Nominal 변수를 이용하여 Binary Classification을 수행합니다.

In [276]:
cols = train.columns
print("Number of Unique Values in Train Dataset Columns")

for col in cols:
    print("{} : ".format(col),train[col].nunique())

Number of Unique Values in Train Dataset Columns
id :  300000
bin_0 :  2
bin_1 :  2
bin_2 :  2
bin_3 :  2
bin_4 :  2
nom_0 :  3
nom_1 :  6
nom_2 :  6
nom_3 :  6
nom_4 :  4
nom_5 :  222
nom_6 :  522
nom_7 :  1220
nom_8 :  2215
nom_9 :  11981
ord_0 :  3
ord_1 :  5
ord_2 :  6
ord_3 :  15
ord_4 :  26
ord_5 :  192
day :  7
month :  12
target :  2


In [277]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 25 columns):
id        300000 non-null int64
bin_0     300000 non-null int64
bin_1     300000 non-null int64
bin_2     300000 non-null int64
bin_3     300000 non-null object
bin_4     300000 non-null object
nom_0     300000 non-null object
nom_1     300000 non-null object
nom_2     300000 non-null object
nom_3     300000 non-null object
nom_4     300000 non-null object
nom_5     300000 non-null object
nom_6     300000 non-null object
nom_7     300000 non-null object
nom_8     300000 non-null object
nom_9     300000 non-null object
ord_0     300000 non-null int64
ord_1     300000 non-null object
ord_2     300000 non-null object
ord_3     300000 non-null object
ord_4     300000 non-null object
ord_5     300000 non-null object
day       300000 non-null int64
month     300000 non-null int64
target    300000 non-null int64
dtypes: int64(8), object(17)
memory usage: 57.2+ MB


# Preprocessing

## Features(X), Label Split(y) 

In [278]:
X_train = train.drop(columns = ['id','target'])

int_cols = ['bin_0', 'bin_1', 'bin_2','ord_0', 'ord_2', 'day', 'month']
for col in int_cols:
    X_train[col] = X_train[col].astype(object)

y_train = train['target']

## Label Encoder
먼저, LabelEncoder 방법을 이용하여 이진 분류(Binary Classification) 문제를 해결합니다. 
- Label Encoder: 문자열로 구성된 카테고리 변수를 수치형으로 변환. 일반적인 방법은 아님.
- 이진 분류에서 가장 기본적으로 사용되는 Logistic Regression을 이용합니다. 

In [279]:
def Feature_LabelEncoder(features, label, train):
    label_encoder = LabelEncoder()
    
    X_train, X_val, y_train, y_val = train_test_split(features, label, test_size = 0.2, random_state =42)
    
    for col in X_train.columns:
        X_train[col] = label_encoder.fit_transform(X_train[col])
        X_val[col] = label_encoder.fit_transform(X_val[col])
        
    if train == True:
        print("Train Data Shape: {} Rows, {} Columns".format(X_train.shape[0], X_train.shape[1]))
        return X_train, y_train
    
    else:
        print("Train Data Shape: {} Rows, {} Columns".format(X_val.shape[0], X_val.shape[1]))
        return X_val, y_val

In [280]:
X_tr, y_tr = Feature_LabelEncoder(X_train, y_train, train = True)
X_vl, y_vl = Feature_LabelEncoder(X_train, y_train, train = False)

Train Data Shape: 240000 Rows, 23 Columns
Train Data Shape: 60000 Rows, 23 Columns


In [210]:
%%time

logistic_reg = LogisticRegression()
logistic_reg.fit(X_tr, y_tr)

lb_prediction = logistic_reg.predict(X_vl)

print("Accuracy: {}".format(accuracy_score(lb_prediction, y_vl)))

Accuracy: 0.69485
CPU times: user 14.5 s, sys: 9.39 s, total: 23.9 s
Wall time: 6.22 s


## One Hot Encoding
일반적인 One-Hot Encoding을 이용해 Logistic Regression을 먼저 수행합니다.

In [211]:
def Feature_OneHotEncoder(features, label, train):
    encoder = OneHotEncoder()
    
    encoder_train = encoder.fit_transform(features)
    
    X_train, X_val, y_train, y_val = train_test_split(encoder_train, label, test_size = 0.2, random_state = 42)
    
    if train == True:
        print("Train Data Shape: {} Rows, {} Columns".format(X_train.shape[0], X_train.shape[1]))
        return X_train, y_train
    
    else:
        print("Train Data Shape: {} Rows, {} Columns".format(X_val.shape[0], X_val.shape[1]))
        return X_val, y_val

In [212]:
X_tr, y_tr = Feature_OneHotEncoder(X_train, y_train, train = True)
X_vl ,y_vl = Feature_OneHotEncoder(X_train, y_train, train = False)

Train Data Shape: 240000 Rows, 16461 Columns
Train Data Shape: 60000 Rows, 16461 Columns


In [213]:
%%time

model = LogisticRegression()
model.fit(X_tr, y_tr)

ohe_prediction = model.predict(X_vl)

print("Accuracy: ", accuracy_score(y_vl, ohe_prediction))

Accuracy:  0.76035
CPU times: user 17.6 s, sys: 17.4 s, total: 35 s
Wall time: 8.88 s


## Feature Hashing

In [214]:
def Feature_Hashing(features, label, train):
    hashing = FeatureHasher(input_type = 'string')
    
    for col in features.columns:
        features[col] = features[col].astype('str')
    
    train_df = hashing.fit_transform(features.values)
    
    X_train, X_val, y_train, y_val = train_test_split(train_df, label, test_size = 0.2, random_state = 42)
    
    if train == True:
        print("Train Data Shape: {} Rows, {} Columns".format(X_train.shape[0], X_train.shape[1]))
        return X_train, y_train
    
    else:
        print("Validation Data Shape: {} Rows, {} Columns".format(X_val.shape[0], X_train.shape[1]))
        return X_val, y_val

In [215]:
X_tr, y_tr = Feature_Hashing(X_train, y_train, train = True)
X_vl, y_vl = Feature_Hashing(X_train, y_train, train = False)

Train Data Shape: 240000 Rows, 1048576 Columns
Validation Data Shape: 60000 Rows, 1048576 Columns


In [216]:
model = LogisticRegression(random_state = 42)
model.fit(X_tr, y_tr)

fh_prediction = model.predict(X_vl)

print("Accuracy: {}".format(accuracy_score(fh_prediction, y_vl)))

Accuracy: 0.7540666666666667


## Encoding Categories with Dataset Statistics
- 빈도수와 문자열의 유사도에 따라 비슷한 범주를 서로 가깝게 배치하는 인코딩 방법입니다.


In [251]:
def Feature_Stat(features, label, train):
    f = features.copy() # 왜 복사를 안하면 글로벌 인스턴스가 변경이 되는거지?
    
    for col in f.columns:
        f[col] = f[col].astype('category')
        counts = f[col].value_counts().sort_index().fillna(0)
        
        # 컬럼별 고유값 개수에 따른 Uniform 분포를 따르는 난수 생성 후, 각각 1000으로 나눈다. 그 후, 각 빈도수에 더한다.
        # 1000으로 나뉘는 이유는 같은 같은 빈도수가 등장하게 되면, 다른 범주여도 같은 범주로 인식을 할 수 있기 때문이다.
        # ex) 0과 1로 구성된 컬럼은 (2, ) 형태의 난수를 생성하고 각각을 1000으로 나눈후, 빈도에 더한다.
        counts += np.random.rand(len(counts))/1000
        
        f[col].cat.categories = counts
    
    X_tr, X_vl, y_tr, y_vl = train_test_split(features, label, test_size = 0.2, random_state = 42)

    if train == True:
        print("Train Data Shape: {} Rows, {} Columns".format(X_tr.shape[0], X_tr.shape[1]))
        return X_train, y_train
    
    else:
        print("Validation Data Shape: {} Rows, {} Columns".format(X_val.shape[0], X_val.shape[1]))
        return X_val, y_val

In [252]:
X_tr, y_tr = Feature_Stat(X_train, y_train, train = True)
X_vl, y_vl = Feature_Stat(X_train, y_train, train = False)

Train Data Shape: 240000 Rows, 23 Columns
Validation Data Shape: 90000 Rows, 24 Columns


In [227]:
model = LogisticRegression(random_state = 42)
model.fit(X_tr, y_tr)

stat_prediction = model.predict(X_vl)\

print("Accuracy: {}".format(accuracy_score(stat_prediction, y_vl)))

Accuracy: 0.6936166666666667


## Cyclic Feature
- 몇몇 변수는 순환형 값을 보입니다. 예를 들어, 월, 일, 시간이 존재합니다.
- Cosine 함수와 Sine 함수를 사용하여 이를 구현합니다.
- 순환형 변수를 제외하곤, 지금까지 가장 성능이 좋았던 One Hot Encoding 방법을 이용합니다.

In [270]:
def Feature_Cyclic(features, label, train):
    f = features.copy()
    cyclic_cols = ['day', 'month']
    
    for cyclic_col in cyclic_cols:
        f[cyclic_col] = f[cyclic_col].astype('float')
        
        f[cyclic_col + "_sin"] = np.sin((2 * np.pi * f[cyclic_col]) / max(f[cyclic_col]))
        f[cyclic_col + "_cos"] = np.cos((2 * np.pi * f[cyclic_col]) / max(f[cyclic_col]))
    
    f.drop(columns = cyclic_cols, inplace = True)
    
    encoder = OneHotEncoder()
    oh_f = encoder.fit_transform(f)
    
    X_train, X_val, y_train, y_val = train_test_split(oh_f, label, test_size = 0.2, random_state = 42)
    
    if train == True:
        print("Train Data Shape: {} Rows, {} Columns".format(X_train.shape[0], X_train.shape[1]))
        return X_train, y_train
    
    else:
        print("Validation Data Shape: {} Rows, {} Columns".format(X_val.shape[0], X_train.shape[1]))
        return X_val, y_val

In [271]:
X_tr, y_tr = Feature_Cyclic(X_train, y_train, train = True)
X_vl, y_vl = Feature_Cyclic(X_train, y_train, train = False)

Train Data Shape: 240000 Rows, 16478 Columns
Validation Data Shape: 60000 Rows, 16478 Columns


In [273]:
model = LogisticRegression()
model.fit(X_tr, y_tr)

cy_prediction = model.predict(X_vl)
print("Accuracy: {}".format(accuracy_score(cy_prediction, y_vl)))

Accuracy: 0.76145
