# Encoding Categorical Data

Categorical data consists of values that fall into distinct categories or groups. It may be nominal (categories with no inherent order, such as sex or discipline) or ordinal (categories with a meaningful order, such as academic rank). Unlike numerical data, which represents measurable quantities, categorical data represents qualitative or descriptive characteristics. Understanding categorical data is crucial when working with machine learning models: most models require numerical inputs, so categorical features must first be encoded as numbers.

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the Salaries dataset from the Rdatasets mirror. HTTPS is used so the
# download cannot be read or altered in transit as it could over plain HTTP.
data_url = 'https://vincentarelbundock.github.io/Rdatasets/csv/carData/Salaries.csv'
# The first CSV column holds the row labels, so it becomes the index.
df = pd.read_csv(data_url, index_col=0)
df

Unnamed: 0_level_0,rank,discipline,yrs.since.phd,yrs.service,sex,salary
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Prof,B,19,18,Male,139750
2,Prof,B,20,16,Male,173200
3,AsstProf,B,4,3,Male,79750
4,Prof,B,45,39,Male,115000
5,Prof,B,40,41,Male,141500
...,...,...,...,...,...,...
393,Prof,A,33,30,Male,103106
394,Prof,A,31,19,Male,150564
395,Prof,A,42,25,Male,101738
396,Prof,A,25,15,Male,95329


In [3]:
# One-hot encode a single column: every distinct rank becomes its own
# indicator column.
rank_values = df['rank']
df_dummies = pd.get_dummies(rank_values)
df_dummies

Unnamed: 0_level_0,AssocProf,AsstProf,Prof
rownames,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,False,False,True
2,False,False,True
3,False,True,False
4,False,False,True
5,False,False,True
...,...,...,...
393,False,False,True
394,False,False,True
395,False,False,True
396,False,False,True


In [6]:
# Encode several columns at once; each indicator column is prefixed with the
# name of the column it came from (e.g. rank_Prof, sex_Male).
categorical_subset = df[['rank', 'sex']]
df_dummies = pd.get_dummies(categorical_subset)
df_dummies

Unnamed: 0,rank_AssocProf,rank_AsstProf,rank_Prof,sex_Female,sex_Male
1,0,0,1,0,1
2,0,0,1,0,1
3,0,1,0,0,1
4,0,0,1,0,1
5,0,0,1,0,1
...,...,...,...,...,...
393,0,0,1,0,1
394,0,0,1,0,1
395,0,0,1,0,1
396,0,0,1,0,1


In [7]:
# An empty prefix and separator leave only the category labels as column
# names (Prof, Male, A, ...).
df_dummies = pd.get_dummies(
    df[['rank', 'sex', 'discipline']],
    prefix='',
    prefix_sep='',
)
df_dummies.head(10)

Unnamed: 0,AssocProf,AsstProf,Prof,Female,Male,A,B
1,0,0,1,0,1,0,1
2,0,0,1,0,1,0,1
3,0,1,0,0,1,0,1
4,0,0,1,0,1,0,1
5,0,0,1,0,1,0,1
6,1,0,0,0,1,0,1
7,0,0,1,0,1,0,1
8,0,0,1,0,1,0,1
9,0,0,1,0,1,0,1
10,0,0,1,1,0,0,1


In [8]:
# drop_first=True drops the first level of each variable: with k categories,
# k-1 indicators already carry all the information.
# dtype=int pins the indicators to 0/1 integers. Without it the dtype depends
# on the pandas version (bool on pandas >= 2.0), which would mix bool and
# integer columns once these indicators are combined with the numeric ones.
df_dummies = pd.get_dummies(
    df[['rank', 'sex', 'discipline']],
    prefix='',
    prefix_sep='',
    drop_first=True,
    dtype=int,
)
df_dummies.head()

Unnamed: 0,AsstProf,Prof,Male,B
1,0,1,1,1
2,0,1,1,1
3,1,0,1,1
4,0,1,1,1
5,0,1,1,1


In [9]:
# Put the indicator columns alongside the original columns; rows are matched
# on the index the two frames share.
df2 = pd.concat([df, df_dummies], axis='columns')
df2

Unnamed: 0,rank,discipline,yrs.since.phd,yrs.service,sex,salary,AsstProf,Prof,Male,B
1,Prof,B,19,18,Male,139750,0,1,1,1
2,Prof,B,20,16,Male,173200,0,1,1,1
3,AsstProf,B,4,3,Male,79750,1,0,1,1
4,Prof,B,45,39,Male,115000,0,1,1,1
5,Prof,B,40,41,Male,141500,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...
393,Prof,A,33,30,Male,103106,0,1,1,0
394,Prof,A,31,19,Male,150564,0,1,1,0
395,Prof,A,42,25,Male,101738,0,1,1,0
396,Prof,A,25,15,Male,95329,0,1,1,0


# Drop rank, sex and discipline columns because their dummy variables have already been created

In [10]:
# The raw categorical columns are now represented by their indicators, so
# remove them from df2 in place.
df2.drop(columns=['rank', 'sex', 'discipline'], inplace=True)

In [11]:
# Display df2 after the rank, sex and discipline columns were dropped.
df2

Unnamed: 0,yrs.since.phd,yrs.service,salary,AsstProf,Prof,Male,B
1,19,18,139750,0,1,1,1
2,20,16,173200,0,1,1,1
3,4,3,79750,1,0,1,1
4,45,39,115000,0,1,1,1
5,40,41,141500,0,1,1,1
...,...,...,...,...,...,...,...
393,33,30,103106,0,1,1,0
394,31,19,150564,0,1,1,0
395,42,25,101738,0,1,1,0
396,25,15,95329,0,1,1,0


In [16]:
# Features: every column except the salary target, as a NumPy array.
feature_frame = df2.drop(columns=['salary'])
X = feature_frame.values

In [17]:
# Display the feature array built from every df2 column except salary.
X

array([[19, 18,  0,  1,  1,  1],
       [20, 16,  0,  1,  1,  1],
       [ 4,  3,  1,  0,  1,  1],
       ...,
       [42, 25,  0,  1,  1,  0],
       [25, 15,  0,  1,  1,  0],
       [ 8,  4,  1,  0,  1,  0]])

In [18]:
# Target: the salary column of df2, kept as a pandas Series.
target_column = 'salary'
y = df2[target_column]

In [19]:
# Display the target values (the salary column of df2).
y

1      139750
2      173200
3       79750
4      115000
5      141500
        ...  
393    103106
394    150564
395    101738
396     95329
397     81035
Name: salary, Length: 397, dtype: int64

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

In [22]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)