# One Hot Encoding

`One Hot Encoding` converts `categorical` data into `numeric` form by creating one binary (0/1) indicator column for each category.

In [None]:
import pandas as pd

In [None]:
# Read the insurance dataset CSV into a DataFrame.
# NOTE: the absolute path assumes the file sits in /content (e.g. a Google Colab session).
df = pd.read_csv(filepath_or_buffer='/content/insurance.csv')

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Print a summary of the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Gather the names of all object-dtype (text/categorical) columns into a plain list
catg_cols = list(df.select_dtypes(include = ['object']).columns)

In [None]:
# Display the categorical column names that will be one-hot encoded
catg_cols

['sex', 'smoker', 'region']

In [None]:
#one hot encoding
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense NumPy array from transform() so the result can be assigned
# straight into DataFrame columns. scikit-learn 1.2 renamed the `sparse`
# keyword to `sparse_output` and 1.4 removed `sparse`, so try the current
# name first and fall back to the old one on older releases.
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# instead of raising an error.
try:
    encoder = OneHotEncoder(sparse_output = False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse = False, handle_unknown='ignore')

# Learn the distinct categories present in each categorical column
encoder.fit(df[catg_cols])



In [None]:
# Show the categories learned during fit: one array per column, in the order of catg_cols
encoder.categories_

[array(['female', 'male'], dtype=object),
 array(['no', 'yes'], dtype=object),
 array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)]

In [None]:
# Names of the generated indicator columns, formatted as "<column>_<category>"
encoded_cols = encoder.get_feature_names_out(catg_cols).tolist()

In [None]:
# Display the names of the new one-hot encoded columns
encoded_cols

['sex_female',
 'sex_male',
 'smoker_no',
 'smoker_yes',
 'region_northeast',
 'region_northwest',
 'region_southeast',
 'region_southwest']

In [None]:
# Encode the categorical columns and append the resulting 0/1 indicator
# columns to the frame under their generated names
df[encoded_cols] = encoder.transform(
    df.loc[:, catg_cols]
)

In [None]:
# Preview the frame with the one-hot columns appended next to the original ones
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,female,27.9,0,yes,southwest,16884.924,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,male,33.77,1,no,southeast,1725.5523,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,male,33.0,3,no,southeast,4449.462,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,male,22.705,0,no,northwest,21984.47061,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,32,male,28.88,0,no,northwest,3866.8552,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Remove the original text columns now that their one-hot encoded versions exist.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: once the columns are gone, a second run is a no-op
# rather than a KeyError.
df = df.drop(columns = catg_cols, errors = 'ignore')

In [None]:
# Preview the final, fully numeric frame
df.head(n=5)

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,32,28.88,0,3866.8552,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
