# Dummy variables and One Hot Encoding..

### Using Pandas

In [1]:
import pandas as pd

In [3]:
tips_df = pd.read_csv(r'C:\Users\Dell\dataSets\tips.csv')
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
# to create dummy variables use "get_dummies()" method..

dummy_df = pd.get_dummies(tips_df)
dummy_df

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1,0,1,0,0,0,1,0,1,0
1,10.34,1.66,3,0,1,1,0,0,0,1,0,1,0
2,21.01,3.50,3,0,1,1,0,0,0,1,0,1,0
3,23.68,3.31,2,0,1,1,0,0,0,1,0,1,0
4,24.59,3.61,4,1,0,1,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0,1,1,0,0,1,0,0,1,0
240,27.18,2.00,2,1,0,0,1,0,1,0,0,1,0
241,22.67,2.00,2,0,1,0,1,0,1,0,0,1,0
242,17.82,1.75,2,0,1,1,0,0,1,0,0,1,0


#### The above encoding for categorical variables is called "One-Hot Encoding"..

In [5]:
# now see the fact.. in this dataframe format we don't need both male and female section, only one of them
# is sufficient to describe customer....
# simillarly for smoking "yes" or "no" only one column is sufficient... so we have to use "drop_first" attribute.

pd.get_dummies(tips_df, drop_first = True)

Unnamed: 0,total_bill,tip,size,sex_Male,smoker_Yes,day_Sat,day_Sun,day_Thur,time_Lunch
0,16.99,1.01,2,0,0,0,1,0,0
1,10.34,1.66,3,1,0,0,1,0,0
2,21.01,3.50,3,1,0,0,1,0,0
3,23.68,3.31,2,1,0,0,1,0,0
4,24.59,3.61,4,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,1,0,0,0
240,27.18,2.00,2,0,1,1,0,0,0
241,22.67,2.00,2,1,1,1,0,0,0
242,17.82,1.75,2,1,0,1,0,0,0


### Using Scikit Learn's OneHotEncoder

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
One_hot_encode = OneHotEncoder(sparse = False) 

In [10]:
One_hot_encode_array = One_hot_encode.fit_transform(tips_df[['sex', 'smoker', 'day', 'time']])
One_hot_encode_array

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [11]:
# here also we have get Numpy Array not dataframe... so converting it to DATAFRAME

In [13]:
# we can also drop un necessary columns...
One_hot_encode2 = OneHotEncoder(sparse = False, drop = 'first')  # we have an option to drop first or last columns..
One_hot_encode2.fit_transform(tips_df[['sex', 'smoker', 'day', 'time']])

array([[0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       ...,
       [1., 1., 1., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])

# Ordinal Variables, Nominal Variables...

#### Ordinal Variables are Categorical Variables in which the categories can be Meaningfully Ordered 
#### When Ordinal Variable convert into number then it has mathematical value..
e.g.- Rating, Grades, Months, Quality etc.

#### Nominal Variables is same like ordinal variables but There is nothing that indicates an Instric Order of the Levels and in principle. All labels are equal, no order. NO matheatical value
e.g.- Colour : red, green, blue, black (there is no mathematical value or priority)

# Label and Ordinal Encoding

In Label Encoding, Categories are encoded by Numbers as their Alphabetical Orders...
In Ordinal Encoding, Categories are encoded by numbers as their weightage or Priority...

In [21]:
import pandas as pd
from sklearn import preprocessing as pp

In [23]:
df = pd.read_csv(r'C:\Users\Dell\dataSets\train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [24]:
df2 = df[['KitchenQual','BldgType']]

In [26]:
# let us check number of unique values
df['BldgType'].value_counts()

1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
Name: BldgType, dtype: int64

### Label Encoding

In [29]:
pp.LabelEncoder().fit_transform(df2['BldgType'])

array([0, 0, 0, ..., 0, 0, 0])

In [30]:
# converting NUMPY array into DATAFRAME
df2["BldgType_L_encd"] = pp.LabelEncoder().fit_transform(df2["BldgType"])
df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["BldgType_L_encd"] = pp.LabelEncoder().fit_transform(df2["BldgType"])


Unnamed: 0,KitchenQual,BldgType,BldgType_L_encd
0,Gd,1Fam,0
1,TA,1Fam,0
2,Gd,1Fam,0
3,Gd,1Fam,0
4,Gd,1Fam,0
...,...,...,...
1455,TA,1Fam,0
1456,TA,1Fam,0
1457,Gd,1Fam,0
1458,Gd,1Fam,0


### Ordinal Encoding

In [32]:
df["KitchenQual"].value_counts()

TA    735
Gd    586
Ex    100
Fa     39
Name: KitchenQual, dtype: int64

In [47]:
order_label = {"Ex":4, "Gd":3, "TA":2, "Fa":1}

df2["KitchenQual_ord_enc"] = df2["KitchenQual"].map(order_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2["KitchenQual_ord_enc"] = df2["KitchenQual"].map(order_label)


In [48]:
df2

Unnamed: 0,KitchenQual,BldgType,BldgType_L_encd,KitchenQual_ord_enc
0,Gd,1Fam,0,3
1,TA,1Fam,0,2
2,Gd,1Fam,0,3
3,Gd,1Fam,0,3
4,Gd,1Fam,0,3
...,...,...,...,...
1455,TA,1Fam,0,2
1456,TA,1Fam,0,2
1457,Gd,1Fam,0,3
1458,Gd,1Fam,0,3
