# Categorical Feature Encoding Challenge - Modeling

Tutorial Link -> https://www.kaggle.com/code/werooring/ch7-modeling

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cat-in-the-dat/sample_submission.csv
/kaggle/input/cat-in-the-dat/train.csv
/kaggle/input/cat-in-the-dat/test.csv


In [2]:
data_path = '/kaggle/input/cat-in-the-dat/'

# All three CSVs are read with the `id` column as the row index.
train = pd.read_csv(f'{data_path}train.csv', index_col='id')
test = pd.read_csv(f'{data_path}test.csv', index_col='id')
submission = pd.read_csv(f'{data_path}sample_submission.csv', index_col='id')

## Feature Engineering 1: Feature Encoding

**Combine Data**

In [3]:
# Stack train on top of test so both are encoded together, and leave the
# target column out of the feature frame.
all_data = pd.concat([train, test]).drop(columns='target')

**Binary Feature Encoding**

In [4]:
# Letter-coded binary columns and the 0/1 value each letter maps to.
binary_maps = {
    'bin_3': {'F': 0, 'T': 1},
    'bin_4': {'N': 0, 'Y': 1},
}

# NOTE: Series.map yields NaN for values missing from the mapping, so this
# cell should be run only once per kernel session.
for column, mapping in binary_maps.items():
    all_data[column] = all_data[column].map(mapping)

**Ordered Feature Encoding**

In [5]:
# Category labels listed from lowest to highest rank; the position in the
# list becomes the integer code.
ord1_levels = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord2_levels = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']

ord1dict = {level: rank for rank, level in enumerate(ord1_levels)}
ord2dict = {level: rank for rank, level in enumerate(ord2_levels)}

# Replace each label with its rank.
for column, mapping in (('ord_1', ord1dict), ('ord_2', ord2dict)):
    all_data[column] = all_data[column].map(mapping)

In [6]:
from sklearn.preprocessing import OrdinalEncoder

# Columns whose category order is determined by OrdinalEncoder from the data.
ord_345 = [f'ord_{i}' for i in (3, 4, 5)]

# Learn the category lists first, then swap each value for its integer code.
ord_encoder = OrdinalEncoder().fit(all_data[ord_345])
all_data[ord_345] = ord_encoder.transform(all_data[ord_345])

# Show, per column, the category list that defines the encoding order.
for col_name, col_categories in zip(ord_345, ord_encoder.categories_):
    print(col_name, col_categories, sep='\n')

ord_3
['a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o']
ord_4
['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']
ord_5
['AP' 'Ai' 'Aj' 'BA' 'BE' 'Bb' 'Bd' 'Bn' 'CL' 'CM' 'CU' 'CZ' 'Cl' 'DH'
 'DN' 'Dc' 'Dx' 'Ed' 'Eg' 'Er' 'FI' 'Fd' 'Fo' 'GD' 'GJ' 'Gb' 'Gx' 'Hj'
 'IK' 'Id' 'JX' 'Jc' 'Jf' 'Jt' 'KR' 'KZ' 'Kf' 'Kq' 'LE' 'MC' 'MO' 'MV'
 'Mf' 'Ml' 'Mx' 'NV' 'Nf' 'Nk' 'OR' 'Ob' 'Os' 'PA' 'PQ' 'PZ' 'Ps' 'QM'
 'Qb' 'Qh' 'Qo' 'RG' 'RL' 'RP' 'Rm' 'Ry' 'SB' 'Sc' 'TR' 'TZ' 'To' 'UO'
 'Uk' 'Uu' 'Vf' 'Vx' 'WE' 'Wc' 'Wv' 'XI' 'Xh' 'Xi' 'YC' 'Yb' 'Ye' 'ZR'
 'ZS' 'Zc' 'Zq' 'aF' 'aM' 'aO' 'aP' 'ac' 'av' 'bF' 'bJ' 'be' 'cA' 'cG'
 'cW' 'ck' 'cp' 'dB' 'dE' 'dN' 'dO' 'dP' 'dQ' 'dZ' 'dh' 'eG' 'eQ' 'eb'
 'eg' 'ek' 'ex' 'fO' 'fh' 'gJ' 'gM' 'hL' 'hT' 'hh' 'hp' 'iT' 'ih' 'jS'
 'jV' 'je' 'jp' 'kC' 'kE' 'kK' 'kL' 'kU' 'kW' 'ke' 'kr' 'kw' 'lF' 'lL'
 'll' 'lx' 'mb' 'mc' 'mm' 'nX' 'nh' 'oC' 'oG' 'oH' 'oK' 'od' 'on' 'pa'
 'ps' 'qA' 'qJ' 'qK' 'qP' 'qX' '

**Nominal Feature Encoding**

In [7]:
# Column names nom_0 through nom_9.
nom_features = [f'nom_{i}' for i in range(10)]

In [8]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder on the nominal columns, then expand them into one-hot
# indicator columns.
onehot_encoder = OneHotEncoder().fit(all_data[nom_features])
encoded_nom_matrix = onehot_encoder.transform(all_data[nom_features])

encoded_nom_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5000000 stored elements and shape (500000, 16276)>

In [9]:
# Remove the raw nominal columns from the feature frame.
all_data = all_data.drop(columns=nom_features)

**Date Feature Encoding**

In [10]:
date_features = ['day', 'month']

# Use a dedicated encoder for the date columns. Refitting the shared
# `onehot_encoder` here would overwrite the categories it learned from the
# nominal features, leaving it out of sync with `encoded_nom_matrix`.
date_encoder = OneHotEncoder()
encoded_date_matrix = date_encoder.fit_transform(all_data[date_features])

all_data = all_data.drop(date_features, axis=1)  # drop original date features

encoded_date_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1000000 stored elements and shape (500000, 19)>

## Feature Engineering 2: Feature Scaling

**Ordered Feature Scaling**

In [11]:
from sklearn.preprocessing import MinMaxScaler

# Column names ord_0 through ord_5.
ord_features = [f'ord_{i}' for i in range(6)]

# Min-max scale each of these columns.
scaler = MinMaxScaler()
all_data[ord_features] = scaler.fit_transform(all_data[ord_features])

**Combining Encoded and Scaled Features**

In [12]:
from scipy import sparse

# Column blocks to join side by side: the remaining dense columns (converted
# to CSR), then the one-hot nominal and one-hot date matrices.
feature_blocks = [
    sparse.csr_matrix(all_data),
    encoded_nom_matrix,
    encoded_date_matrix,
]

all_data_sprs = sparse.hstack(feature_blocks, format='csr')

In [13]:
# Display the repr of the combined sparse feature matrix.
all_data_sprs

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9163718 stored elements and shape (500000, 16306)>

**Separating Data**

In [14]:
# Labels come from the original training frame.
y = train['target']

# The first `num_train` rows are taken as the training set and the remaining
# rows as the test set.
num_train = train.shape[0]
X_train = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation, stratified on the target.
# The split reads the full training slice of `all_data_sprs` rather than
# `X_train`, because this cell reassigns `X_train`: splitting `X_train` itself
# would make a second run of the cell fail with mismatched X / y lengths.
X_train, X_valid, y_train, y_valid = train_test_split(all_data_sprs[:num_train], y,
                                                      test_size=0.1,
                                                      stratify=y,
                                                      random_state=10)

## Hyperparameter Optimization

In [16]:
%%time

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Search space: three values of C crossed with three iteration caps. Solver
# and seed are single-value entries, so they are the same for every candidate.
lr_params = {
    'C': [0.1, 0.125, 0.2],
    'max_iter': [800, 900, 1000],
    'solver': ['liblinear'],
    'random_state': [42],
}

# 5-fold cross-validated grid search that ranks candidates by ROC AUC.
logistic_model = LogisticRegression()
gridsearch_logistic_model = GridSearchCV(
    estimator=logistic_model,
    param_grid=lr_params,
    scoring='roc_auc',
    cv=5,
)
gridsearch_logistic_model.fit(X_train, y_train)

print('Optimized hyper parameter:', gridsearch_logistic_model.best_params_)

Optimized hyper parameter: {'C': 0.125, 'max_iter': 800, 'random_state': 42, 'solver': 'liblinear'}
CPU times: user 29min 2s, sys: 4.37 s, total: 29min 6s
Wall time: 7min 37s


## Validating Model Performance

In [17]:
# Predict class probabilities for the validation rows and keep column 1.
valid_proba = gridsearch_logistic_model.predict_proba(X_valid)
y_valid_preds = valid_proba[:, 1]

In [18]:
from sklearn.metrics import roc_auc_score 

# Score the held-out predictions with ROC AUC.
roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_preds)

print(f'validation data ROC AUC: {roc_auc:.4f}')

validation data ROC AUC: 0.8045


## Submitting Prediction and Result

- ROC AUC
    - reference link: https://www.youtube.com/watch?v=4jRBRDbJemM

In [19]:
# Probability that target is 1 for each test row, from the best estimator
# found by the grid search.
best_model = gridsearch_logistic_model.best_estimator_
y_preds = best_model.predict_proba(X_test)[:, 1]

# Fill the sample submission's target column and write it to disk.
submission['target'] = y_preds
submission.to_csv('submission.csv')