# Categorical Feature Encoding Challenge - Baseline

Tutorial link -> https://www.kaggle.com/code/werooring/ch7-baseline 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the input directory,
# then print the paths one per line in walk order.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cat-in-the-dat/sample_submission.csv
/kaggle/input/cat-in-the-dat/train.csv
/kaggle/input/cat-in-the-dat/test.csv


In [2]:
# Directory that holds the competition CSV files.
data_path = '/kaggle/input/cat-in-the-dat/'

# Read the three files in order, using the 'id' column as the row index of each.
train, test, submission = (
    pd.read_csv(data_path + csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Feature Engineering

**Combining Data**

In [3]:
# Stack train on top of test, then remove the label column so that only
# feature columns remain in the combined frame.
all_data = pd.concat([train, test]).drop(columns='target')
all_data

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,...,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2
1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,...,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8
2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,...,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2
3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,...,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1
4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,...,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,0,0,0,F,N,Green,Square,Lion,Canada,Theremin,...,9e4b23160,acc31291f,1,Novice,Lava Hot,j,A,Gb,1,3
499996,1,0,0,F,Y,Green,Trapezoid,Lion,China,Piano,...,cfbd87ed0,eae3446d0,1,Contributor,Lava Hot,f,S,Ed,2,2
499997,0,1,1,T,Y,Green,Trapezoid,Lion,Canada,Oboe,...,1108bcd6c,33dd3cf4b,1,Novice,Boiling Hot,g,V,TR,3,1
499998,1,0,0,T,Y,Blue,Star,Hamster,Costa Rica,Bassoon,...,606ac930b,d4cf587dd,2,Grandmaster,Boiling Hot,g,X,Ye,2,1


**One Hot Encoding**
- One-hot encoding transforms categorical data into a numerical format that can be provided to algorithms that require numerical input.
- Categorical data, such as names of cities, colors, or other non-numerical variables, must be converted into numerical form before being used in models. One-hot encoding does this by creating one binary (0/1) column for each distinct category value.

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Learn the categories of every column from the combined frame,
# then one-hot encode that same frame.
encoder = OneHotEncoder()
all_data_encoded = encoder.fit(all_data).transform(all_data)

**Separating Data**

In [5]:
# all_data was built as train followed by test, so the first num_train rows
# of the encoded matrix are the training rows and the rest are the test rows.
num_train = train.shape[0]
X_train, X_test = all_data_encoded[:num_train], all_data_encoded[num_train:]

# Labels for the training rows.
y = train['target']

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified on the target.
# The split is taken from the full encoded training rows (not from X_train,
# which this cell overwrites), so re-running the cell does not try to split an
# already reduced X_train against the full-length y.
X_train, X_valid, y_train, y_valid = train_test_split(all_data_encoded[:len(train)], y,
                                                      test_size=0.1,
                                                      stratify=y,
                                                      random_state=10)

## Training the Model

In [7]:
from sklearn.linear_model import LogisticRegression

# Build the classifier with a fixed seed and a raised iteration cap,
# then fit it on the training split.
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X=X_train, y=y_train)

## Validate Model Performance

- Logistic Regression 
    - reference link: https://www.youtube.com/watch?v=yIYKR4sgzI8

In [8]:
# Class-probability estimates for the validation rows:
# one row per sample, one column per class.
logistic_model.predict_proba(X_valid)

array([[0.2327445 , 0.7672555 ],
       [0.91407413, 0.08592587],
       [0.83020749, 0.16979251],
       ...,
       [0.24857411, 0.75142589],
       [0.49402796, 0.50597204],
       [0.95658053, 0.04341947]])

In [9]:
# Hard class labels (0 or 1) predicted for the validation rows.
logistic_model.predict(X_valid)

array([1, 0, 0, ..., 1, 1, 0])

In [10]:
# Predicted probability of class 1 for each validation row
# (second column of predict_proba); used as the score for ROC AUC.
y_valid_preds = logistic_model.predict_proba(X_valid)[:, 1]

In [11]:
from sklearn.metrics import roc_auc_score # ROC AUC 

# Score the held-out split: ROC AUC of the class-1 probabilities
# against the true validation labels.
roc_auc = roc_auc_score(y_true=y_valid, y_score=y_valid_preds)

print(f'validation data ROC AUC : {roc_auc:.4f}')

validation data ROC AUC : 0.7965


In [12]:
# Predicted probability that the target is 1 for each test row
# (second column of predict_proba), not a hard 0/1 label.
y_preds = logistic_model.predict_proba(X_test)[:, 1]

In [13]:
# Put the test-set probabilities in the 'target' column and save the result;
# the 'id' index is written out as the first CSV column.
submission = submission.assign(target=y_preds)
submission.to_csv('submission.csv')