In [1]:
# Handling Categorical values -         ********* USING ENCODERS *********
    
# ML models (like linear regression, decision trees, etc.) can only work with numbers, not text labels.
# We need to convert these categories into numeric form — that's where encoders come in.

# WHICH ENCODER TO USE WHEN : 
# 🔹 OrdinalEncoder → for ordered categories   (Ordinal)
# 🔹 OneHotEncoder → for unordered categories    (Nominal)

# DEFINITIONS :
# One-hot encoding - 
# It is a technique that converts categorical data into a numerical format
# by creating a new binary (0/1) column for each unique category.

# Ordinal encoding - 
# It is a data preprocessing technique that converts categorical data with an inherent order into numerical values.

# HOW TO USE ENCODERS :  ---STEPS---
# 1. Import the Encoder
# 2. Select the Categorical Column/columns
# 3. Create encoder object                                     encoder = OrdinalEncoder() or OneHotEncoder()
# 4. Fit and Transform the Data using encoder                 .fit_transform(categorical_column)
# 5. Check the encoded result                           using .toarray() as By default OneHotEncoder returns a sparse matrix
# 6. Check category names                                     using encoder.categories_
# 7. Convert to DataFrame For better readability                    using pd.DataFrame(..., columns=encoder.categories_[0])
# 8. Merge with numeric columns                                    using pd.concat([...], axis=1)


In [2]:
#---------------------------------STEPS OF THIS ML WORKFLOW---------------------------------CFP DATA--------------

# Split or shuffle your data into train/test sets
# (so that training and testing are independent).

# Encode your categorical columns on the training data.
# (So the model can use it mathematically.)

# Apply the same encoding to the test data before making predictions.

In [3]:
# Step 0: Recall what we have done so far - 

# In our last notebook - 7.Preprocessing :
# 1.  shuffled the Data with stratified shuffle split from Scikit-Learn. 
# 2.  split our data into two parts, Train and test.
# 3.  copied our train_set. 
# 4.  separated the features and label for further steps.
# 5.  Handled Missing Data / NaN, used SimpleImputer from Scikit-Learn.
# 6.  After fitting the imputer with strategy="median", we used imputer.transform()
# -   to replace the NaNs in each column with that column's median.


In [22]:
# Step 1: Import tools and load your data - 

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv(filepath_or_buffer="4CHP_Data.csv")  # California housing data, read from the notebook's working directory


In [23]:
# Display the frame loaded in Step 1. It is named `data`; no variable called
# `housing` is defined in this notebook, so referencing it fails on a fresh kernel.
data

# ocean_proximity is a column that holds text values.
# Machine learning models can't understand text directly — so we'll use One-Hot Encoding
# to convert it into numbers (binary columns: 0/1).

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [24]:
pd.unique(data["ocean_proximity"])

# This text column (ocean_proximity, "op" for short) has no ranking or order — its values are just different categories.
# So the next step is to use One-Hot Encoding.

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [25]:
# STEP 2. Pick out the categorical column(s)

# Selecting with a list of names yields a one-column DataFrame (not a Series);
# this is the object handed to the encoder's fit_transform in Step 4.
data_op = data.loc[:, ["ocean_proximity"]]
data_op

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
...,...
20635,INLAND
20636,INLAND
20637,INLAND
20638,INLAND


In [26]:
# Step 3.
# Build the encoder object — this is always the first step when using an encoder.

# categories="auto" is the default setting: the category list is learned from the data during fit.
encoder = OneHotEncoder(categories="auto")
# `encoder` is now ready to turn category labels into numbers.

In [27]:
# STEP 4.
# Fit the encoder on the categorical column, then transform that same column

data_op_encoded = encoder.fit(data_op).transform(data_op)

# fit()       → learns the unique categories (like 'INLAND', 'NEAR BAY', etc.)
# transform() → creates new numeric columns (0/1), one for each category


#-----------------------------------Encoder returns Sparse Matrix----------------------------------------Explained---------------
# Encoder returns a sparse matrix, not a normal pandas DataFrame.

# What is a sparse matrix?
# A sparse matrix is a matrix where most of the elements have a value of zero.
# Instead of storing all the zeros, which wastes memory, sparse matrices are stored
# in a way that only keeps track of the non-zero elements and their positions. 
# This makes them more efficient for both storage and computation time in applications like machine learning

# That's great for performance, but not so human-friendly to look at.
# That's why we often convert it to a pandas DataFrame — for readability and later merging.

In [28]:
# STEP 5. Check the encoded result
# .toarray() turns the sparse result from Step 4 into a dense NumPy array,
# so the 0/1 rows can be displayed and inspected directly.

data_op_encoded.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [29]:
# STEP 6. Check category names:
# categories_ is a list holding one array of learned category labels per encoded column
# (a single array here, because only ocean_proximity was encoded).
encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [32]:
# STEP 7. Convert to DataFrame for better readability

import pandas as pd  # already imported in Step 1; a harmless repeat

data_op_encoded_df = pd.DataFrame(
    data_op_encoded.toarray(),        # convert sparse → dense NumPy array
    index=data_op.index,              # reuse the source rows' index labels: pd.concat(axis=1) aligns rows by index
    columns=encoder.categories_[0],   # category names become column names; [0] because only one text column was encoded
)


#------------Explanation----------AND---Syntax--------------
# pandas is needed because we convert the encoded data into a pandas DataFrame.

# Syntax -
# pd.DataFrame(data, index=None, columns=None, dtype=None, copy=None)
# data    - can be of various types (dictionary / list of lists / NumPy array / etc.)   1st arg
#           We passed a NumPy array: data_op_encoded.toarray()
# index   - row labels. We pass data_op.index so the encoded rows carry the same labels
#           as the rows they were computed from. Without it the new frame gets 0..n-1,
#           which only matches when the source frame also uses the default 0..n-1 index
#           (not the case after shuffling or a train/test split).
# columns - column labels: columns=encoder.categories_[0]
#           [0] because we encoded only one column, so its categories sit at position 0;
#           with several non-numeric columns we would use [0], [1], [2], ...

In [33]:
# Step 8: Merge the encoded columns with the numeric data -
# putting both side by side lets the model use all features together.

data_num = data.drop(columns="ocean_proximity")    # everything except the non-numeric column
data_prepared = pd.concat(
    [data_num, data_op_encoded_df],
    axis="columns",                                # append the encoded columns next to the numeric ones
)

# Fully numeric data is prepared and ready for machine learning algorithms.