In [22]:
# Feature scaling - It is a preprocessing step in machine learning where
# you standardize or normalize numerical features so they are on a similar scale.

# What is Feature Scaling?
# It means transforming the values of each numeric feature so that:
# They are not too large or too small compared to others.
# They fit within a certain range (like 0–1 in MinMax Scaling).
# Or have mean = 0 and standard deviation = 1 (in Standardization).

# Common methods:
# StandardScaler → (x – mean) / std             (No range needed as argument)
# MinMaxScaler → (x – min) / (max – min)        (feature_range defaults to (0, 1); pass it only for another range, e.g. (-1, 1))

In [34]:
# Recap from the last notebook - Step 1: import the tools and load the data.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the housing dataset from a CSV file located in the current working directory.
data = pd.read_csv("4CHP_Data.csv")  # California housing data


In [35]:
data  # shown as the cell's output

# The ocean_proximity column holds text values.
# Machine learning models can't work with text directly, so we'll use One-Hot Encoding
# to convert it into numbers (one binary 0/1 column per category).

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [36]:
data["ocean_proximity"].unique()  # gives you every distinct category in the column

# Notice that the text values in this column have no ranking or order; they are just different categories.
# So the next step is to use One-Hot Encoding.

array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'],
      dtype=object)

In [37]:
# STEP 2. Pick out the categorical column(s).

# Selecting with a list of names keeps the result a DataFrame (one column wide).
data_op = data.loc[:, ["ocean_proximity"]]
data_op

Unnamed: 0,ocean_proximity
0,NEAR BAY
1,NEAR BAY
2,NEAR BAY
3,NEAR BAY
4,NEAR BAY
...,...
20635,INLAND
20636,INLAND
20637,INLAND
20638,INLAND


In [38]:
# STEP 3.
# Create the encoder object - this is always the first step when using an encoder.

encoder = OneHotEncoder()
# This creates an encoder that will convert categories into numbers.

In [39]:
# STEP 4.
# Fit the encoder on the categorical column, then transform that column with it.

# fit() → learns the unique categories (like 'INLAND', 'NEAR BAY', etc.)
encoder.fit(data_op)

# transform() → creates a new numeric (0/1) column for each learned category
data_op_encoded = encoder.transform(data_op)


#-----------------------------------Encoder returns Sparse Matrix----------------------------------------Explained---------------
# Encoder returns a sparse matrix, not a normal pandas DataFrame.

# What is a sparse matrix?
# A sparse matrix is a matrix where most of the elements have a value of zero.
# Instead of storing all the zeros, which wastes memory, sparse matrices are stored
# in a way that only keeps track of the non-zero elements and their positions. 
# This makes them more efficient for both storage and computation time in applications like machine learning

# That’s great for performance, but not so human-friendly to look at.
# That’s why we often convert it to pandas DataFrame — for readability and later merging.

In [40]:
# STEP 5. Check the encoded result.
# toarray() converts the sparse result into a dense NumPy array so the 0/1 values can be inspected.

data_op_encoded.toarray()

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [41]:
# STEP 6. Check the category names the encoder learned during fit.
# The result is a list with one array of category names per encoded column.
encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

In [42]:
# STEP 7. Convert to a DataFrame for better readability.

import pandas as pd  # already imported in the first cell; repeating it here is harmless

data_op_encoded_df = pd.DataFrame(
    data_op_encoded.toarray(),       # convert sparse → dense NumPy array
    index=data_op.index,             # reuse the source row labels so the later pd.concat(axis=1) lines rows up by label
    columns=encoder.categories_[0],  # use the category names as column names; [0] because only one text column was encoded
)


#------------Explanation----------AND---Syntax--------------
# We use pandas here because the encoded data has to be converted into a pandas DataFrame.

# Syntax - 
# pd.DataFrame(data, index=None, columns=None, dtype=None, copy=False)
# data - it can be of various types (dictionary / list of lists / NumPy array / etc.)   1st arg
# We gave data a NumPy array - data_op_encoded.toarray()   1st arg
# and column names - columns=encoder.categories_[0]        keyword arg
# [0] because we encoded only one column, so its categories are at index 0; in case we had encoded
# multiple non-numeric columns, we would use [0], [1], [2], ... to get the categories of each one.

In [43]:
# STEP 8. Merge the encoded columns with the numeric data,
# so the model can use all features together.

# Everything except the non-numeric column.
data_num = data.drop(columns="ocean_proximity")
# Append the encoded columns to the right of the numeric ones.
df_prepared = pd.concat(objs=[data_num, data_op_encoded_df], axis="columns")

# The data is now fully numeric and prepared for machine learning algorithms.

In [44]:
# Step 0: Recall what we have done so far - 

# In our last notebook - 7.Preprocessing :

# 1.  shuffled the Data with stratified shuffle split from Scikit-Learn. 

# 2.  split our data into two parts, Train and test.

# 3.  copied our train_set. 

# 4.  separated the features and the label for further steps.

# 5.  Handled Missing Data / NaN, used SimpleImputer from Scikit-Learn.

# 6.  After setting up imputation with strategy="median", we used imputer.transform() 
# -   to replace the NaNs with the median of each column. 

# 7.  Handle Categorical Variables with One-Hot Encoding :
      # In order to use OneHotEncoder we have to separate the categorical column so that we
      # can apply the encoder, and then fit and transform. 
      # 1. Select the Categorical Column/columns - op
      # 2. Created the encoder Object 
      # 3. Fit and Transform the Data using encoder  

# 8.  Convert to DataFrame - Add the encoded Categorical column (data_prepared) , back to the dataframe .

# 9. Feature scaling - apply scaling to our columns where the numerical difference is too large. 
     # 1. import scaler MinMaxScaler 
     # 2. Create a scaler object
     # 3. Fit + Transform your scaled data

# 10. 

In [45]:
# Inspect the merged frame (numeric columns plus the one-hot encoded columns).
df_prepared

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0.0,0.0,0.0,1.0,0.0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0.0,0.0,0.0,1.0,0.0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0.0,0.0,0.0,1.0,0.0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0.0,0.0,0.0,1.0,0.0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,0.0,1.0,0.0,0.0,0.0
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,0.0,1.0,0.0,0.0,0.0
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,0.0,1.0,0.0,0.0,0.0
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,0.0,1.0,0.0,0.0,0.0


In [46]:
# -----------------------NOTEBOOK 10. FEATURE SCALING-----STARTS FROM  HERE------------------------
# 9. Feature scaling - As we noticed in our prepared dataframe, total_rooms ranges from 6 to over 39,000, 

# while median_income ranges from 0 to 15. If you don't scale these features, many models will give more

# importance to total_rooms simply because it has larger values.

#--------------------------------------------
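# We will use Min-Max Scaling (Normalization) to scale our data to a specific range,
# usually [0, 1] or [-1, 1].

# Formula (for the [0, 1] range): 
# scaled_value = (x - min) / (max - min)
# For a general range [a, b]: scaled_value = a + (x - min) * (b - a) / (max - min)
# Let's apply scaling below - 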

#--------------------------------------------
# Steps to scale - we apply scaling because the numerical difference between some columns in our data is too large,
# so we will scale our entire data using the Min-Max method.

# 1. import scaler MinMaxScaler 
# 2. Create a scaler object
# 3. Fit + Transform your scaled data
# 4. Convert to DataFrame - Add the scaled data , back to the dataframe .

In [47]:
from sklearn.preprocessing import MinMaxScaler    # Or StandardScaler

In [53]:
# Create the scaler object; feature_range=(-1, 1) asks for output values between -1 and 1.
scaler = MinMaxScaler(feature_range=(-1,1))   # StandardScaler() is the alternative

In [54]:
# Fit the scaler on df_prepared and rescale every one of its columns in a single call.
# NOTE(review): this also rescales the target column median_house_value and the one-hot
# columns (their 0/1 values appear as -1/1 in the output further down) - confirm this is intended.
# NOTE(review): the scaler is fitted on the full dataset here; in a train/test workflow it is
# normally fitted on the training set only - confirm.
df_scaled = scaler.fit_transform(df_prepared)

In [55]:
# Put the scaled values back into a DataFrame, reusing the column names and row labels of df_prepared.
df_scaled = pd.DataFrame(
    data=df_scaled,
    index=df_prepared.index,
    columns=df_prepared.columns,
)

In [56]:
df_scaled                       # our scaled data, with values in the range -1 to 1

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,-0.577689,0.134963,0.568627,-0.955339,-0.960273,-0.982118,-0.958888,0.079337,0.804533,-1.0,-1.0,-1.0,1.0,-1.0
1,-0.575697,0.130712,-0.215686,-0.638995,-0.657045,-0.865579,-0.626048,0.076054,0.416493,-1.0,-1.0,-1.0,1.0,-1.0
2,-0.579681,0.128587,1.000000,-0.925479,-0.941341,-0.972365,-0.942115,-0.067944,0.390101,-1.0,-1.0,-1.0,1.0,-1.0
3,-0.581673,0.128587,1.000000,-0.935297,-0.927374,-0.968889,-0.928301,-0.290603,0.345566,-1.0,-1.0,-1.0,1.0,-1.0
4,-0.581673,0.128587,1.000000,-0.917341,-0.913408,-0.968497,-0.915146,-0.538448,0.349277,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-0.350598,0.475027,-0.058824,-0.915408,-0.884233,-0.952801,-0.891794,-0.853740,-0.739791,-1.0,1.0,-1.0,-1.0,-1.0
20636,-0.374502,0.477152,-0.333333,-0.964647,-0.953755,-0.980212,-0.962835,-0.716294,-0.743914,-1.0,1.0,-1.0,-1.0,-1.0
20637,-0.376494,0.464400,-0.372549,-0.885447,-0.849783,-0.943720,-0.857918,-0.834471,-0.681234,-1.0,1.0,-1.0,-1.0,-1.0
20638,-0.396414,0.464400,-0.333333,-0.905489,-0.873371,-0.958631,-0.885545,-0.811409,-0.712574,-1.0,1.0,-1.0,-1.0,-1.0
