## **SHETH L.U.J. & SIR M.V. COLLEGE**

**Rajanish bhardwaj | T073**
### Practical No. 3
**Aim:** Feature Scaling and Dummification
* Apply feature-scaling techniques like standardization and normalization to numerical features.
* Perform feature dummification to convert categorical variables into numerical
representations.

### **Part 1: Handling Numerical Data**

**1: Import Libraries and Load Data**

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

# Read heart.csv into a DataFrame and preview its first rows.
df = pd.read_csv('heart.csv')
print("Original Data Head:", df.head(), sep="\n")


Original Data Head:
    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  52.0  1.0  0.0     125.0  212.0  0.0      1.0    168.0    0.0      1.0   
1  53.0  1.0  0.0     140.0  203.0  1.0      0.0    155.0    1.0      3.1   
2  70.0  1.0  0.0     145.0  174.0  0.0      1.0    125.0    1.0      2.6   
3  61.0  1.0  0.0     148.0  203.0  0.0      1.0    161.0    0.0      0.0   
4  62.0  0.0  0.0     138.0  294.0  1.0      1.0    106.0    0.0      1.9   

   slope   ca  thal  target  
0    2.0  2.0   3.0     0.0  
1    0.0  0.0   3.0     0.0  
2    NaN  0.0   3.0     0.0  
3    2.0  1.0   3.0     0.0  
4    1.0  3.0   2.0     0.0  


2: Rescaling a Feature (MinMax Scaling)

In [None]:
# Min-max scale the 'age' column into the [0, 1] range.
minmax_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
feature_age = df[['age']].to_numpy()  # double brackets keep it 2-D (n_rows, 1)
scaled_age = minmax_scaler.fit_transform(feature_age)

print("Scaled Age (First 5 values):", scaled_age[:5].ravel(), sep="\n")

Scaled Age (First 5 values):
[0.47916667 0.5        0.85416667 0.66666667 0.6875    ]


3: Standardizing a Feature (Z-Score & Robust)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, StandardScaler

# Take 'chol' as a 2-D column and fill its gaps with the column mean;
# both scalers below are fitted on this imputed array.
feature_chol = df[['chol']].to_numpy()
imputer = SimpleImputer(strategy="mean")
chol_imputed = imputer.fit_transform(feature_chol)

# Z-score standardization: the result has mean 0 and standard deviation 1.
scaler = StandardScaler()
standardized_chol = scaler.fit_transform(chol_imputed)

print(
    "Standardized Cholesterol (Mean and Std):",
    f"Mean: {standardized_chol.mean():.6f}",
    f"Std:  {standardized_chol.std():.6f}",
    sep="\n",
)

# Robust scaling as an alternative that is less sensitive to outliers.
robust_scaler = RobustScaler()
robust_chol = robust_scaler.fit_transform(chol_imputed)

print("\nRobust Scaled Cholesterol (First 5):", robust_chol[:5].ravel(), sep="\n")



Standardized Cholesterol (Mean and Std):
Mean: -0.000000
Std:  1.000000

Robust Scaled Cholesterol (First 5):
[-0.5        -0.64516129 -1.11290323 -0.64516129  0.82258065]


4: Normalizing Observations

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer

# Treat each (age, chol) pair as one observation vector.
features_norm = df[['age', 'chol']].to_numpy()

# Replace missing entries with the per-column mean before normalizing.
imputer = SimpleImputer(strategy="mean")
features_imputed = imputer.fit_transform(features_norm)

# Rescale every row to unit L2 length (row-wise, unlike the column-wise
# scalers used earlier).
normalizer = Normalizer(norm="l2")
normalized_features = normalizer.transform(features_imputed)

print("Normalized Age & Chol (First 5 rows):", normalized_features[:5], sep="\n")



Normalized Age & Chol (First 5 rows):
[[0.23822153 0.97121084]
 [0.25261592 0.96756664]
 [0.37322851 0.92773944]
 [0.28778067 0.95769634]
 [0.20634593 0.9784791 ]]


5: Grouping Observations Using Clustering

In [None]:
# Cluster patients into 3 groups using 'age' and 'chol'.

# Fill missing age/chol entries with the column means.
imputer = SimpleImputer(strategy="mean")
features_cluster = df[['age', 'chol']].to_numpy()
features_imputed = imputer.fit_transform(features_cluster)

# Standardize both columns so neither dominates the distance computation
# purely because of its larger magnitude.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_imputed)

# Fit KMeans (fixed seed) and store each row's cluster id on the dataframe.
clusterer = KMeans(n_clusters=3, random_state=0)
clusterer.fit(features_scaled)
labels = clusterer.labels_
df['cluster_group'] = labels

print(
    "Clustered Groups (First 5 rows):",
    df[['age', 'chol', 'cluster_group']].head(),
    sep="\n",
)



Clustered Groups (First 5 rows):
    age   chol  cluster_group
0  52.0  212.0              1
1  53.0  203.0              1
2  70.0  174.0              1
3  61.0  203.0              1
4  62.0  294.0              2


6: Deleting Observations with Missing Values

In [None]:
# Keep only the rows that have no missing value in any column.
df_clean = df.dropna(axis=0, how="any")

print(df_clean.head())
print("Rows before:", df.shape[0])
print("Rows after dropna:", df_clean.shape[0])


    age  sex   cp  trestbps   chol  fbs  restecg  thalach  exang  oldpeak  \
0  52.0  1.0  0.0     125.0  212.0  0.0      1.0    168.0    0.0      1.0   
1  53.0  1.0  0.0     140.0  203.0  1.0      0.0    155.0    1.0      3.1   
3  61.0  1.0  0.0     148.0  203.0  0.0      1.0    161.0    0.0      0.0   
4  62.0  0.0  0.0     138.0  294.0  1.0      1.0    106.0    0.0      1.9   
6  58.0  1.0  0.0     114.0  318.0  0.0      2.0    140.0    0.0      4.4   

   slope   ca  thal  target  cluster_group  
0    2.0  2.0   3.0     0.0              1  
1    0.0  0.0   3.0     0.0              1  
3    2.0  1.0   3.0     0.0              1  
4    1.0  3.0   2.0     0.0              2  
6    0.0  3.0   1.0     0.0              2  
Rows before: 1025
Rows after dropna: 498


7: Imputing Missing Values

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Work on a copy and blank out 'chol' for index labels 0 through 10
# (.loc slices include the end label, so this is 11 rows).
df_missing = df.copy()
df_missing.loc[0:10, 'chol'] = np.nan

# Fill the gaps with the mean of the remaining 'chol' values, writing the
# result into a second copy so the before/after frames can be compared.
imputer = SimpleImputer(strategy='mean')
df_imputed = df_missing.copy()
df_imputed['chol'] = imputer.fit_transform(df_missing[['chol']]).ravel()

print("Before imputation:", df_missing['chol'].iloc[:12].to_numpy())
print("After imputation:", df_imputed['chol'].iloc[:12].to_numpy())




Before imputation: [ nan  nan  nan  nan  nan  nan  nan  nan  nan  nan  nan 341.]
After imputation: [246.0814433 246.0814433 246.0814433 246.0814433 246.0814433 246.0814433
 246.0814433 246.0814433 246.0814433 246.0814433 246.0814433 341.       ]


# Part 2: Handling Categorical Data & Imbalanced Classes

7: Imports for Categorical Data

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelBinarizer

# Re-read the raw file so Part 2 starts from unmodified data, without the
# columns Part 1 added to df (e.g. 'cluster_group').
df = pd.read_csv(filepath_or_buffer='heart.csv')

8: Encoding Nominal Categorical Features

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer

# Fill missing 'cp' entries with the column's mode, then drop the trailing
# axis so the binarizer receives a 1-D array of shape (n,).
imputer = SimpleImputer(strategy="most_frequent")
cp_imputed = imputer.fit_transform(df[['cp']])[:, 0]

# One indicator column per distinct 'cp' value.
one_hot = LabelBinarizer()
cp_encoded = one_hot.fit(cp_imputed).transform(cp_imputed)

print("One-Hot Encoded 'cp' (First 5 rows):", cp_encoded[:5], sep="\n")
print("Classes:", one_hot.classes_)



One-Hot Encoded 'cp' (First 5 rows):
[[1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]
 [1 0 0 0]]
Classes: [0. 1. 2. 3.]


9: Encoding Dictionaries of Features

In [None]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

# Turn the 'cp' and 'thal' columns into one {column: value} dict per row.
data_dict = df[['cp', 'thal']].to_dict('records')

# NOTE: DictVectorizer only one-hot encodes string values. 'cp' and 'thal'
# hold numbers here, so they come through as two plain numeric columns
# (feature names 'cp' and 'thal') rather than as indicator columns.
dictvectorizer = DictVectorizer(sparse=False)
features_dict = dictvectorizer.fit_transform(data_dict)

print("Dictionary Vectorized Features (First row):", features_dict[0], sep="\n")

print("Feature Names:", dictvectorizer.get_feature_names_out())


Dictionary Vectorized Features (First row):
[0. 3.]
Feature Names: ['cp' 'thal']


10: Encoding Ordinal Categorical Features (Binning)



In [None]:
import pandas as pd

# Ordinal code for each age group; the key order doubles as the bin order.
scale_mapper = {"Young": 1, "Middle": 2, "Senior": 3}

# Bucket 'age' into (0, 30], (30, 55] and (55, 100], labelled in that order.
df['age_group'] = pd.cut(
    x=df['age'],
    bins=[0, 30, 55, 100],
    labels=list(scale_mapper),
)

# Translate each group label into its ordinal code.
df['age_group_encoded'] = df['age_group'].map(scale_mapper)

print(
    "Binned and Encoded Age (First 5 rows):",
    df[['age', 'age_group', 'age_group_encoded']].head(),
    sep="\n",
)


Binned and Encoded Age (First 5 rows):
    age age_group age_group_encoded
0  52.0    Middle                 2
1  53.0    Middle                 2
2  70.0    Senior                 3
3  61.0    Senior                 3
4  62.0    Senior                 3


11: Imputing Missing Values with KNN (predicting missing 'age')

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor

# Fill missing 'age' values by predicting them from 'chol' and 'trestbps'
# with a distance-weighted 3-nearest-neighbours regressor.

# Step 1 — Columns used to predict age
features = ['chol', 'trestbps']

# Step 2 — Mean-impute the predictor columns so that neither the training
# rows nor the rows to predict contain NaN.
imputer = SimpleImputer(strategy='mean')
df[features] = imputer.fit_transform(df[features])

# Compute the missing-age mask once; it is reused for the train/test split
# and for writing the predictions back.
age_missing = df['age'].isna()

# Step 3 — TRAIN rows: age is known
train_df = df[~age_missing]
X_train = train_df[features].values
y_train = train_df['age'].values

# Step 4 — TEST rows: age is missing
test_df = df[age_missing]
X_test = test_df[features].values

# Step 5 — Train KNN
knn = KNeighborsRegressor(n_neighbors=3, weights='distance')
knn.fit(X_train, y_train)

# Steps 6 & 7 — Predict the missing ages and write them back. predict()
# raises ValueError on a zero-row input, so skip it when nothing is missing.
if len(X_test) > 0:
    predicted_age = knn.predict(X_test)
    df.loc[age_missing, 'age'] = predicted_age
else:
    predicted_age = np.array([])

print("Predicted missing age values:", predicted_age)


Predicted missing age values: [71.         59.         63.         67.         56.         65.
 43.         52.         40.         43.         56.         42.
 38.         57.         51.         71.         52.         58.
 70.         37.         60.         58.         54.         59.
 55.         64.         58.         47.         60.         54.66666667
 41.         44.         49.         48.         44.         63.
 43.         54.         46.         63.         61.         62.
 70.         68.         59.         50.         54.6278083  42.
 42.         54.6278083  43.         56.         50.         64.
 58.         47.        ]


**12: Handling Imbalanced Classes**

In [None]:
import numpy as np

# Balance the 'target' classes by randomly downsampling the larger class to
# the size of the smaller one.

print("Target Distribution (Original):")
class_counts = df['target'].value_counts()
print(class_counts)

# value_counts() sorts by frequency (descending), so the first label is the
# majority class and the last is the minority class. Deriving them from the
# data avoids hard-coded labels that can go stale.
majority_class = class_counts.index[0]
minority_class = class_counts.index[-1]

# Positional row indices of each class (rows with a missing target match
# neither class and are left out).
i_majority = np.where(df['target'] == majority_class)[0]
i_minority = np.where(df['target'] == minority_class)[0]

n_minority = len(i_minority)

# Downsample the majority class without replacement. A seeded generator
# makes the selected rows reproducible between runs.
rng = np.random.default_rng(0)
downsampled_majority_indices = rng.choice(i_majority, size=n_minority, replace=False)

# Combine
final_indices = np.hstack((i_minority, downsampled_majority_indices))

# The indices are positions, not labels, hence iloc.
df_balanced = df.iloc[final_indices]

print("\nBalanced Target Distribution (After Downsampling):")
print(df_balanced['target'].value_counts())

Target Distribution (Original):
target
1.0    502
0.0    471
Name: count, dtype: int64

Balanced Target Distribution (After Downsampling):
target
0.0    471
1.0    471
Name: count, dtype: int64
