<a href="https://colab.research.google.com/github/RemyaRS/Feature-Engineering/blob/main/Converting_Categorical_Variables_into_Numerical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Using Google Drive to import the dataset

In [2]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be read by an ordinary file path.
drive.mount('/content/drive')

Mounted at /content/drive


# Importing the dataset from Google Drive

Source : https://www.kaggle.com/datasets/larsen0966/penguins

In [3]:
# Location of the penguins CSV inside the mounted Google Drive.
PENGUINS_CSV = "/content/drive/MyDrive/penguins.csv"

# Load the raw table. pandas labels the headerless first CSV column
# 'Unnamed: 0' (it looks like a saved row index -- TODO confirm).
data = pd.read_csv(PENGUINS_CSV)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


# Remove observations having NaN values

In [4]:
# Count the missing entries in each column.
data.isna().sum()

Unnamed: 0            0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [5]:
# Keep only the rows that have no missing value in any column.
data = data.dropna()

In [6]:
# Re-count missing entries to confirm that none remain.
data.isna().sum()

Unnamed: 0           0
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

# Finding all numerical and categorical features present in the dataset

In [7]:
from sklearn.compose import make_column_selector

# Build one selector per dtype family; calling a selector on the frame
# returns the names of the matching columns as a list.
cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)

cat_cols = cat_selector(data)
num_cols = num_selector(data)

print('Categorical Feature Variables are', cat_cols)
print('Numerical Feature Variables are', num_cols)

Categorical Feature Variables are ['species', 'island', 'sex']
Numerical Feature Variables are ['Unnamed: 0', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year']


# Conversion via Mapping

In [8]:
# List the distinct species labels that need a numeric code.
data['species'].unique()

array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object)

In [9]:
# Hand-written integer code for each species label.
SPECIES_CODES = {'Adelie': 0, 'Gentoo': 1, 'Chinstrap': 2}

# Replace every species label with its code. Values that are not keys of the
# mapping (including codes written by an earlier run of this cell) are passed
# through unchanged, so re-running the cell no longer turns the whole column
# into NaN the way a plain dict-based .map() would.
data['species'] = data['species'].map(
    lambda label: SPECIES_CODES.get(label, label)
)

In [10]:
# Preview the first five rows to inspect the species column.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,0,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,0,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,0,Torgersen,40.3,18.0,195.0,3250.0,female,2007
4,5,0,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,6,0,Torgersen,39.3,20.6,190.0,3650.0,male,2007


In [11]:
# List the distinct values now stored in the species column.
data['species'].unique()

array([0, 1, 2])

# Conversion via Label Encoding

In [12]:
# List the distinct island labels that need a numeric code.
data['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [13]:
from sklearn.preprocessing import LabelEncoder

In [14]:
# Encoder that replaces each distinct label with an integer code.
# NOTE(review): scikit-learn documents LabelEncoder for target labels (y);
# OrdinalEncoder is its counterpart for feature columns -- confirm that using
# it on a feature here is intentional.
le = LabelEncoder()

In [15]:
# Learn the island labels and overwrite the column with their integer codes.
# LabelEncoder numbers the labels in sorted order, so for this data
# Biscoe -> 0, Dream -> 1 and Torgersen -> 2.
data['island'] = le.fit_transform(data['island'])

In [16]:
# Preview the last five rows to inspect the island column.
data.tail(n=5)

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
339,340,2,1,55.8,19.8,207.0,4000.0,male,2009
340,341,2,1,43.5,18.1,202.0,3400.0,female,2009
341,342,2,1,49.6,18.2,193.0,3775.0,male,2009
342,343,2,1,50.8,19.0,210.0,4100.0,male,2009
343,344,2,1,50.2,18.7,198.0,3775.0,female,2009


# Conversion via One-Hot Encoding

Nominal variables - categorical variables that can only be categorized

Ordinal variables - categorical variables that can be categorized and ranked

The feature variable `sex` is nominal — male and female are not smaller or greater than each other — so it is converted with one-hot encoding rather than with ranked integer codes.

In [17]:
# List the distinct values of the sex column.
data['sex'].unique()

array(['male', 'female'], dtype=object)

In [18]:
# One-hot encode only the nominal `sex` column into sex_female / sex_male.
# Naming the column explicitly keeps the result independent of which other
# columns still hold strings when this cell runs, and dtype=int produces 0/1
# indicator columns on every pandas version (recent releases default to
# booleans).
dataset = pd.get_dummies(data, columns=['sex'], dtype=int)

In [19]:
# Preview the first five rows of the one-hot encoded frame.
dataset.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,sex_female,sex_male
0,1,0,2,39.1,18.7,181.0,3750.0,2007,0,1
1,2,0,2,39.5,17.4,186.0,3800.0,2007,1,0
2,3,0,2,40.3,18.0,195.0,3250.0,2007,1,0
4,5,0,2,36.7,19.3,193.0,3450.0,2007,1,0
5,6,0,2,39.3,20.6,190.0,3650.0,2007,0,1
