# Sheth L.U.J. & Sir M.V. College Of Arts, Science & Commerce
 
# Shobit Halse | T083

# Aim : Feature Scaling and Dummification
# • Apply feature-scaling techniques like standardization and normalization to
# numerical features.

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Load the housing dataset and print a quick overview of its contents.
df = pd.read_csv('housing.csv')
print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

Dataset shape: (20640, 10)

First few rows:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Dataset info:
<class 'pandas.c

3.1 Rescaling a Feature

In [2]:
# Select median_income with double brackets so the result stays
# two-dimensional (a single column), then take the underlying array.
income_column = df[['median_income']]
feature = income_column.values

# Min-max rescaling: learn the column's range, then map it onto [0, 1].
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
minmax_scaler.fit(feature)
scaled_feature = minmax_scaler.transform(feature)

# Show the first ten rescaled values.
print(scaled_feature[:10])

Original feature (first 5 values):
[[8.3252]
 [8.3014]
 [7.2574]
 [5.6431]
 [3.8462]]

Scaled feature (first 5 values):
[[0.53966842]
 [0.53802706]
 [0.46602805]
 [0.35469856]
 [0.23077613]]


3.2 Standardizing a Feature

In [3]:
# Standardize the total_rooms column with StandardScaler.
feature = df[['total_rooms']].values

scaler = preprocessing.StandardScaler()
standardized = scaler.fit(feature).transform(feature)

print(standardized[:10])

print("Mean and Standard Deviation")

# Statistics of the standardized column; the mean is rounded before printing.
standardized_mean = round(standardized.mean())
standardized_std = standardized.std()
print(f"Mean: {standardized_mean}")
print(f"Standard Deviation: {standardized_std}")

print("Using RobustScaler")

# RobustScaler applied to the same total_rooms column.
robust_scaler = preprocessing.RobustScaler()
robust_scaled = robust_scaler.fit(feature).transform(feature)

print(robust_scaled[:10])

Original feature statistics:
Mean: 2635.76
Std: 2181.56

Standardized feature statistics:
Mean: 0
Standard Deviation: 1.00


# Using RobustScaler

In [4]:
# Robust-scale the population column.
population_column = df[['population']]
feature = population_column.values

robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit(feature)
robust_scaled = robust_scaler.transform(feature)

# Display the first five scaled values.
print("Robust scaled feature (first 5 values):")
print(robust_scaled[:5])

Robust scaled feature (first 5 values):
[[-0.89978678]
 [ 1.31663113]
 [-0.71428571]
 [-0.64818763]
 [-0.64072495]]


# Normalizing Observations

In [5]:
# Two-column feature matrix; the normalizer works on each row (observation).
features = df[['median_house_value', 'population']].values

# Normalizer configured with the L2 norm.
normalizer = Normalizer(norm="l2")

# transform() is used without a prior fit(); scikit-learn documents
# Normalizer as a stateless estimator. Only the first ten rows are shown.
first_ten_rows = features[:10]
normalizer.transform(first_ten_rows)

Original features:
[[  37.88 -122.23]
 [  37.86 -122.22]
 [  37.85 -122.24]
 [  37.85 -122.25]
 [  37.85 -122.25]]

Normalized features (L2):
[[ 0.29601821 -0.9551823 ]
 [ 0.29589769 -0.95521964]
 [ 0.29578221 -0.9552554 ]
 [ 0.29576013 -0.95526224]
 [ 0.29576013 -0.95526224]]

Sum of the first observation's values (L1):
-0.5268253076010243


In [None]:
# Normalize every observation in `features` using the L1 norm.
l1_normalizer = Normalizer(norm="l1")
features_l1_norm = l1_normalizer.transform(features)

# Add up the values of the first normalized observation and report the total.
first_row_sum = features_l1_norm[0].sum()
print(f"Sum of the first observation's values: {first_row_sum}")

3.4 Grouping Observations Using Clustering

In [6]:
# Cluster the observations on median_income and population.
# NOTE(review): the two columns are clustered unscaled; population has a much
# larger magnitude than median_income, so it likely dominates the distances —
# confirm this is intended.
features = df[['median_income', 'population']].values

# Three clusters, with a fixed random_state.
clusterer = KMeans(n_clusters=3, random_state=0)
clusterer.fit(features)

# Record each row's predicted cluster in a new 'group' column on df.
group_labels = clusterer.predict(features)
df['group'] = group_labels

df.head(6)

Clustered data:
   feature_1  feature_2  group
0      37.88    -122.23      1
1      37.86    -122.22      1
2      37.85    -122.24      1
3      37.85    -122.25      1
4      37.85    -122.25      1

Cluster distribution:
group
1    46
2    28
0    26
Name: count, dtype: int64


3.5 Deleting Observations with Missing Values

In [7]:
# Work on a two-column subset of the frame.
# NOTE(review): the missing-value summary captured in this notebook lists
# NaNs only in total_bedrooms, so dropna() presumably removes nothing for
# these two columns — confirm whether total_bedrooms should be included.
df2 = df[['median_income', 'population']]

# Drop rows containing missing values and show the first ten remaining rows.
rows_without_missing = df2.dropna()
rows_without_missing[:10]

Missing values in dataset:
longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

Original array:
[[ 1.1 11.1]
 [ 2.2 22.2]
 [ 3.3 33.3]
 [ nan 55. ]]

Array after removing missing values:
[[ 1.1 11.1]
 [ 2.2 22.2]
 [ 3.3 33.3]]

DataFrame after dropna():
   feature_1  feature_2
0        1.1       11.1
1        2.2       22.2
2        3.3       33.3


# Imputing Missing Values

In [8]:
# Take an independent copy of the total_bedrooms column as a 2-D array.
feature_with_missing = df[['total_bedrooms']].values.copy()

# Report how many entries are NaN and preview the raw values.
missing_count = np.isnan(feature_with_missing).sum()
print("Missing values count:", missing_count)
print("Original first 10 values:")
print(feature_with_missing[:10])

# Mean strategy: the imputer's learned fill value is exposed in statistics_.
mean_imputer = SimpleImputer(strategy="mean")
features_mean_imputed = mean_imputer.fit_transform(feature_with_missing)
mean_fill_value = mean_imputer.statistics_[0]

print("\nAfter mean imputation (first 10 values):")
print(features_mean_imputed[:10])
print(f"\nImputed value (mean): {mean_fill_value:.2f}")

# Median strategy on the same column, for comparison.
median_imputer = SimpleImputer(strategy="median")
features_median_imputed = median_imputer.fit_transform(feature_with_missing)
median_fill_value = median_imputer.statistics_[0]

print(f"Imputed value (median): {median_fill_value:.2f}")

Missing values count: 207
Original first 10 values:
[[ 129.]
 [1106.]
 [ 190.]
 [ 235.]
 [ 280.]
 [ 213.]
 [ 489.]
 [ 687.]
 [ 665.]
 [ 707.]]

After mean imputation (first 10 values):
[[ 129.]
 [1106.]
 [ 190.]
 [ 235.]
 [ 280.]
 [ 213.]
 [ 489.]
 [ 687.]
 [ 665.]
 [ 707.]]

Imputed value (mean): 537.87
Imputed value (median): 435.00


# Summary Statistics

In [9]:
# Compute descriptive statistics for the frame, then print them under a heading.
summary = df.describe()
print("Summary statistics of numerical features:")
print(summary)

Summary statistics of numerical features:
          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000