# Aim : - Feature Scaling and Dummification

# Perform feature dummification to convert categorical variables into numerical representations.

# Load Data

In [None]:
# --- CATEGORICAL AND IMBALANCED DATA

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer 
from sklearn.feature_extraction import DictVectorizer 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# 1. Load the Dataset

df = pd.read_csv('kc_house_data.csv')

# --- 2. Define Features for the tasks ---
NOMINAL_FEATURE = 'zipcode'
ORDINAL_FEATURE = 'grade'
IMBALANCED_TARGET = 'waterfront' 

# Encoding Nominal Categorical Features (Using LabelBinarizer )

In [3]:
print("\n--- Encoding Nominal Categorical Feature: LabelBinarizer (zipcode) ---")
# Use only the first 100 rows to limit the number of zipcodes for cleaner output
nominal_feature_array = df[NOMINAL_FEATURE].head(100).values.reshape(-1, 1)

# Create one-hot encoder
one_hot = LabelBinarizer()

# One-hot encode feature
one_hot_encoded = one_hot.fit_transform(nominal_feature_array)

print("Shape of encoded array:", one_hot_encoded.shape)
print("First 5 rows of one-hot encoded array:")
print(one_hot_encoded[:5])

print("\n--- Using pandas get_dummies (as per Page 2) ---")
# This is generally the preferred method in a Data Science workflow
dummy_features = pd.get_dummies(df[NOMINAL_FEATURE], prefix=NOMINAL_FEATURE, dtype=int)
print("First 5 rows of pandas get_dummies output:")
print(dummy_features.head())


--- Encoding Nominal Categorical Feature: LabelBinarizer (zipcode) ---
Shape of encoded array: (100, 49)
First 5 rows of one-hot encoded array:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0]]

--- Using pandas get_dummies (as per Page 2) ---
First 5 rows of pandas get_dummies output:
   zipcode_98001  zipcode_98002  zipcode_98003  zipcode_98004  zipcode_98005  \
0              0              0              0              0              0   
1              0              0              0              0              0   
2     

# 5.2 Encoding Ordinal Categorical Features (Using DataFrame replace method)

In [4]:
print(f"\n--- 5.2 Encoding Ordinal Categorical Features: {ORDINAL_FEATURE} ---")

# The 'grade' feature is already numerical (1-13) but can be re-mapped for demonstration
# We will treat 1-5 as 'Low', 6-8 as 'Medium', 9-13 as 'High' for this example
df_ordinal = df[[ORDINAL_FEATURE]].copy()

# Create mapper
grade_mapper = {
    g: 'Low' if g <= 5 else ('Medium' if g <= 8 else 'High') 
    for g in df_ordinal[ORDINAL_FEATURE].unique()
}
df_ordinal['Grade_Text'] = df_ordinal[ORDINAL_FEATURE].replace(grade_mapper)

# Define the ordinal numerical replacement
scale_mapper = {
    "Low": 1,
    "Medium": 2,
    "High": 3
}

# Replace feature values with scale
df_ordinal['Grade_Ordinal'] = df_ordinal['Grade_Text'].replace(scale_mapper)

print(f"Original {ORDINAL_FEATURE} vs New Ordinal Scale (First 5 rows):")
print(df_ordinal[['Grade_Text', 'Grade_Ordinal']].head())


--- 5.2 Encoding Ordinal Categorical Features: grade ---
Original grade vs New Ordinal Scale (First 5 rows):
  Grade_Text  Grade_Ordinal
0     Medium              2
1     Medium              2
2     Medium              2
3     Medium              2
4     Medium              2


  df_ordinal['Grade_Ordinal'] = df_ordinal['Grade_Text'].replace(scale_mapper)


# Encoding Dictionaries of Features (DictVectorizer)
# (Adaptation uses existing data structured as a dictionary list)

In [5]:
print("\n--- Encoding Dictionaries of Features: DictVectorizer ---")
# Create a sample list of dictionaries from 'price' and 'sqft_living'
data_dict = df[['price', 'sqft_living']].head(5).to_dict('records')

# Create dictionary vectorizer
dictvectorizer = DictVectorizer(sparse=False)

# Convert dictionary to feature matrix
features_dict_vec = dictvectorizer.fit_transform(data_dict)

print("First 5 rows of DictVectorizer output:")
print(features_dict_vec)
print("Feature Names:", dictvectorizer.get_feature_names_out())


--- Encoding Dictionaries of Features: DictVectorizer ---
First 5 rows of DictVectorizer output:
[[221900.   1180.]
 [538000.   2570.]
 [180000.    770.]
 [604000.   1960.]
 [510000.   1680.]]
Feature Names: ['price' 'sqft_living']


# Imputing Missing Class Values: Most Frequent (SimpleImputer)
# (Adaptation uses the 'zipcode' feature after adding an artificial NaN)

In [6]:
print("\n--- Imputing Missing Class Values: Most Frequent (SimpleImputer) ---")

# Create a copy and add a missing value for demonstration
zip_codes_missing = df[NOMINAL_FEATURE].head(10).copy()
zip_codes_missing.loc[3] = np.nan # Introduce a missing value

# Convert to 2D array required by SimpleImputer
zip_codes_array = zip_codes_missing.values.reshape(-1, 1)

# Create imputer (strategy='most_frequent')
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent') 

# Impute values
imputed_zip_codes = imputer.fit_transform(zip_codes_array)

print("Original Feature Array (with NaN at index 3):")
print(zip_codes_array.flatten())
print("Imputed Feature Array (NaN replaced by most frequent value):")
print(imputed_zip_codes.flatten())


--- Imputing Missing Class Values: Most Frequent (SimpleImputer) ---
Original Feature Array (with NaN at index 3):
[98178. 98125. 98028.    nan 98074. 98053. 98003. 98198. 98146. 98038.]
Imputed Feature Array (NaN replaced by most frequent value):
[98178. 98125. 98028. 98003. 98074. 98053. 98003. 98198. 98146. 98038.]


# Handling Imbalanced Classes: Class Weighting (RandomForestClassifier)
# Using the highly imbalanced 'waterfront' target (IMBALANCED_TARGET)

In [7]:
print("\n--- Handling Imbalanced Classes: Class Weighting ---")

# Define target and features (using raw features for simplicity)
y_target = df[IMBALANCED_TARGET].values
X_features = df[['sqft_living', 'price']].values

# 1. Calculate the class distribution
class_0_count = np.sum(y_target == 0)
class_1_count = np.sum(y_target == 1)
total_count = len(y_target)

# Display imbalance
print(f"Class 0 (No Waterfront): {class_0_count} ({class_0_count/total_count:.2%})")
print(f"Class 1 (Waterfront): {class_1_count} ({class_1_count/total_count:.2%})")

# 2. Create the classifier with balanced class weights (as per Page 7)
clf_balanced = RandomForestClassifier(class_weight='balanced', random_state=42)

print("\nRandomForestClassifier created using class_weight='balanced'")
# You would then fit this model: clf_balanced.fit(X_features, y_target)

# 3. Demonstration of Downsampling (as per Page 7)

# Identify indices for each class
i_class0 = np.where(y_target == 0)[0]
i_class1 = np.where(y_target == 1)[0]
n_class1 = len(i_class1)

# Downsample the majority class (Class 0) to match the size of the minority class (Class 1)
i_class0_downsampled = np.random.choice(i_class0, size=n_class1, replace=False)

# Create the downsampled target and feature matrices
target_downsampled = np.hstack((y_target[i_class0_downsampled], y_target[i_class1]))
features_downsampled = np.vstack((X_features[i_class0_downsampled, :], X_features[i_class1, :]))

print("\n--- Downsampling (Majority Class) ---")
print(f"New Downsampled Dataset Size: {len(target_downsampled)}")
print(f"New Class 0 Count: {np.sum(target_downsampled == 0)}")
print(f"New Class 1 Count: {np.sum(target_downsampled == 1)}")


--- Handling Imbalanced Classes: Class Weighting ---
Class 0 (No Waterfront): 21450 (99.25%)
Class 1 (Waterfront): 163 (0.75%)

RandomForestClassifier created using class_weight='balanced'

--- Downsampling (Majority Class) ---
New Downsampled Dataset Size: 326
New Class 0 Count: 163
New Class 1 Count: 163
