# Aim: Feature Scaling and Dummification
# Apply feature-scaling techniques like standardization and normalization to numerical features.

 # Load the Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# --- 1. Load the dataset ---
# Expects 'kc_house_data.csv' in the current working directory (relative path).
df = pd.read_csv('kc_house_data.csv')

# --- 2. Declare the columns used by the rest of the notebook ---

# Numeric columns fed to the scalers, the clusterer and the imputer.
NUMERICAL_FEATURES = ['sqft_living', 'sqft_lot', 'yr_built', 'price']

# Categorical column that gets one-hot encoded (dummified).
CATEGORICAL_FEATURE_NAME = 'condition'

# Plain NumPy matrix for scikit-learn: one row per observation,
# one column per entry of NUMERICAL_FEATURES (same order).
features_all = df[NUMERICAL_FEATURES].to_numpy()


# 4.1 Rescaling a feature: MinMaxScaler (Normalization)


In [14]:
print("\n--- 4.1 Rescaling: MinMaxScaler ---")
# Learn each column's min/max first, then map every column onto [0, 1].
minmax_scaler = MinMaxScaler(feature_range=(0, 1)).fit(features_all)
scaled_features_minmax = minmax_scaler.transform(features_all)
# Show only the first five rows to keep the output short.
print(
    "First 5 rows of the scaled features array (MinMaxScaler):",
    scaled_features_minmax[:5],
    sep="\n",
)


--- 4.1 Rescaling: MinMaxScaler ---
First 5 rows of the scaled features array (MinMaxScaler):
[[0.06716981 0.00310751 0.47826087 0.01926557]
 [0.17207547 0.00407187 0.44347826 0.06072131]
 [0.03622642 0.00574253 0.28695652 0.01377049]
 [0.12603774 0.00271377 0.56521739 0.06937705]
 [0.10490566 0.00457949 0.75652174 0.05704918]]


# 4.2 Standardizing a Feature: StandardScaler & RobustScaler


In [15]:
# StandardScaler: per-column z-scores (zero mean, unit variance).
print("\n--- 4.2 Standardizing: StandardScaler ---")
standard_scaler = StandardScaler().fit(features_all)
standardized_features = standard_scaler.transform(features_all)
print(
    "First 5 rows of the standardized features array:",
    standardized_features[:5],
    sep="\n",
)

# RobustScaler (default settings): centers on the median and scales by the
# interquartile range, so extreme values influence the result less.
print("\n--- 4.2 Standardizing: RobustScaler ---")
robust_scaler = RobustScaler().fit(features_all)
robust_scaled_features = robust_scaler.transform(features_all)
print(
    "First 5 rows of the Robust Scaled features array:",
    robust_scaled_features[:5],
    sep="\n",
)


--- 4.2 Standardizing: StandardScaler ---
First 5 rows of the standardized features array:
[[-0.97983502 -0.22832133 -0.54489777 -0.86671733]
 [ 0.53363434 -0.18988538 -0.6810785  -0.00568792]
 [-1.42625404 -0.12329847 -1.29389179 -0.98084935]
 [-0.13055006 -0.2440144  -0.20444594  0.17409044]
 [-0.43542158 -0.16965339  0.54454807 -0.08195753]]

--- 4.2 Standardizing: RobustScaler ---
First 5 rows of the Robust Scaled features array:
[[-0.65004452 -0.34844193 -0.43478261 -0.70608265]
 [ 0.58771149 -0.06657224 -0.52173913  0.27240365]
 [-1.01513802  0.42174221 -0.91304348 -0.83578393]
 [ 0.0445236  -0.46352691 -0.2173913   0.47670639]
 [-0.20480855  0.08179887  0.26086957  0.18572976]]


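# 4.3 Normalizing Observations: Normalizer
# (Note: Normalizer rescales each row (observation) to unit norm instead of rescaling each column, so on raw features with very different magnitudes the largest column dominates. It is rarely used this way and is included here for completeness.)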

In [16]:
print("\n--- 4.3 Normalizing Observations: Normalizer (L2 Norm) ---")
# Normalizer works row-wise: every observation is divided by its own L2 norm,
# so each output row has unit Euclidean length.
normalizer = Normalizer(norm="l2")
# Normalizer is stateless, so a bare transform() also runs. fit_transform() is
# used anyway because scikit-learn validates constructor parameters (such as
# `norm`) only in fit(), and because it matches how the other transformers in
# this notebook are applied. The numeric result is the same.
normalized_features = normalizer.fit_transform(features_all)

print("First 5 rows of the Normalized features array (L2 Norm):")
print(normalized_features[:5])


--- 4.3 Normalizing Observations: Normalizer (L2 Norm) ---
First 5 rows of the Normalized features array (L2 Norm):
[[0.00531571 0.02545232 0.00880695 0.99962311]
 [0.00477643 0.01345951 0.003626   0.99989143]
 [0.00427091 0.05546633 0.01072164 0.99839386]
 [0.00324489 0.00827777 0.00325317 0.99995518]
 [0.00329366 0.01584094 0.00389554 0.99986151]]


# Feature Dummification: One-Hot Encoding (not among the original document's headings, but required)


In [17]:
print(f"\n--- Feature Dummification: {CATEGORICAL_FEATURE_NAME} (One-Hot Encoding) ---")
# One indicator column per distinct value of the categorical column;
# dtype=int yields 0/1 integers and the prefix keeps the source column name
# visible in the generated column names.
dummy_features = pd.get_dummies(
    df[CATEGORICAL_FEATURE_NAME],
    dtype=int,
    prefix=CATEGORICAL_FEATURE_NAME,
)
print("First 5 rows of Dummified Features:", dummy_features.head(), sep="\n")


--- Feature Dummification: condition (One-Hot Encoding) ---
First 5 rows of Dummified Features:
   condition_1  condition_2  condition_3  condition_4  condition_5
0            0            0            1            0            0
1            0            0            1            0            0
2            0            0            1            0            0
3            0            0            0            0            1
4            0            0            1            0            0


# 4.9 Grouping Observations Using Clustering: KMeans



In [18]:
print("\n--- 4.9 Grouping Observations Using Clustering: KMeans ---")
# Cluster on the standardized matrix rather than the raw values.
# random_state fixes the seed so reruns give the same clusters; n_init=10
# requests ten centroid initialisations.
clusterer = KMeans(n_clusters=3, n_init=10, random_state=42).fit(standardized_features)

# Assign each row to a cluster and store the label as a new column on `df`.
# NOTE: this adds 'cluster_group' to `df` in place, so later cells see it too.
df['cluster_group'] = clusterer.predict(standardized_features)

preview_columns = ['price', 'sqft_living', 'cluster_group']
print("First 5 rows with new 'cluster_group' column:", df[preview_columns].head(), sep="\n")


--- 4.9 Grouping Observations Using Clustering: KMeans ---
First 5 rows with new 'cluster_group' column:
      price  sqft_living  cluster_group
0  221900.0         1180              0
1  538000.0         2570              0
2  180000.0          770              0
3  604000.0         1960              0
4  510000.0         1680              2


# 4.10 Deleting Observations with Missing Values
# 4.11 Imputing Missing Values (Demonstration)
# (Using a copy of the DataFrame 'df' with manually injected missing values, for simplicity)


In [19]:
# --- Build a demonstration frame with two artificially missing values ---
# Work on a copy so `df` itself is never modified by this cell.
# NaN is a float, so the two target columns are cast to float64 explicitly
# before NaN is written. If a column was parsed as integer, assigning NaN
# directly relies on pandas' silent upcast, which is deprecated
# (FutureWarning since pandas 2.1). The cast is a no-op for float columns.
df_missing = df.copy().astype({'sqft_lot': 'float64', 'price': 'float64'})

# Index labels (not ordinal positions) of the rows that receive a NaN.
lot_missing_row = 10
price_missing_row = 15
df_missing.loc[lot_missing_row, 'sqft_lot'] = np.nan
df_missing.loc[price_missing_row, 'price'] = np.nan

print("\n--- 4.10 Deleting Observations with Missing Values ---")
# dropna() removes every row that contains at least one NaN in any column.
df_cleaned = df_missing.dropna()

print(f"Original Row Count: {len(df_missing)}")
print(f"Cleaned Row Count (after dropna): {len(df_cleaned)}")


print("\n--- 4.11 Imputing Missing Values (Mean Imputation) ---")
# SimpleImputer replaces each NaN with the mean of its column.
imputer = SimpleImputer(strategy="mean")

# Fit on, and transform, only the numeric columns; the result is a NumPy array
# whose columns follow the order of NUMERICAL_FEATURES.
imputed_features = imputer.fit_transform(df_missing[NUMERICAL_FEATURES])

# Verify the claim printed below instead of merely asserting it in text.
assert not np.isnan(imputed_features).any(), "imputation left NaN values behind"

# Derive the column position from the feature list rather than hard-coding it.
# The row position equals the index label here, assuming `df` has the default
# 0..n-1 index that read_csv produces.
sqft_lot_col = NUMERICAL_FEATURES.index('sqft_lot')

print(f"Shape of Imputed Array (no missing values): {imputed_features.shape}")
print(
    f"Mean Imputed Value for the first missing 'sqft_lot' "
    f"(Index {lot_missing_row}, Column {sqft_lot_col}): "
    f"{imputed_features[lot_missing_row, sqft_lot_col]:.2f}"
)


--- 4.10 Deleting Observations with Missing Values ---
Original Row Count: 21613
Cleaned Row Count (after dropna): 21611

--- 4.11 Imputing Missing Values (Mean Imputation) ---
Shape of Imputed Array (no missing values): (21613, 4)
Mean Imputed Value for the first missing 'sqft_lot' (Index 10, Column 1): 15107.21
