In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# -----------------------------
# Worked example: z-score computed by hand
# -----------------------------
# Five evenly spaced integers serve as the toy input.
arr = np.array([10, 20, 30, 40, 50])

# z = (x - mean) / std, with NumPy's default population std (ddof=0).
arr_mean = arr.mean()
arr_spread = arr.std()
centered = arr - arr_mean
arr_stand = centered / arr_spread
print("Manual Standardization:\n", arr_stand)


# -----------------------------
# Standard Scaler (same toy array, via scikit-learn)
# -----------------------------
from sklearn.preprocessing import StandardScaler

# The 1-D array is reshaped into a single column before it is handed to the scaler.
arr_2d = arr.reshape(-1, 1)

scaler = StandardScaler()
scaled_arr = scaler.fit_transform(arr_2d)
print("\nStandardScaler Output:\n", scaled_arr)


# -----------------------------
# MinMax Scaler (same toy array, via scikit-learn)
# -----------------------------
from sklearn.preprocessing import MinMaxScaler

# Reshape the 1-D array into one column, then fit and transform in a single call.
arr_as_column = arr.reshape(-1, 1)

min_max = MinMaxScaler()
scaled_minmax = min_max.fit_transform(arr_as_column)
print("\nMinMaxScaler Output:\n", scaled_minmax)


# -----------------------------
# Build the toy patient table and round-trip it through CSV
# -----------------------------
import pandas as pd

# Column-oriented records: one list per column, one position per patient.
data = {
    'name': ['Rex', 'Peter', 'Sam', 'Rita', 'Hilda'],
    'age': [1, 95, 22, 0, 60],
    'height': [20, 180, 160, 145, 125],
    'weight': [0, 100, 75, 55, 40],
    'bp': [80, 100, 65, 70, 40]
}
frame = pd.DataFrame(data)
print("\nOriginal DataFrame:\n", frame)

# Persist without the index column, then load the same file back into `df`.
csv_path = 'patient_data.csv'
frame.to_csv(csv_path, index=False)
df = pd.read_csv(csv_path)
print("\nLoaded DataFrame:\n", df)


# -----------------------------
# Impute zeros in numeric columns with the column mean
# -----------------------------
numeric_cols = df.select_dtypes(include=np.number).columns

# Turn zeros into NaN first so that the means below are computed without them.
zeros_as_nan = df[numeric_cols].replace(0, np.nan)
col_means = zeros_as_nan.mean()

# Fill each NaN with the mean of its own column and write the result back.
df[numeric_cols] = zeros_as_nan.fillna(col_means)

print("\nData after replacing zeros with mean:\n", df)


# -----------------------------
# Keep only the numeric feature columns (the 'name' column is left out)
# -----------------------------
feature_cols = ['age', 'bp', 'height', 'weight']
frame2 = df[feature_cols]

# -----------------------------
# Apply StandardScaler to the selected features
# -----------------------------
scaler = StandardScaler()
scaled = scaler.fit_transform(frame2)

# Per-column statistics (axis=0) of the scaled output.
scaled_col_mean = scaled.mean(axis=0)
scaled_col_std = scaled.std(axis=0)

print("\nStandardScaler Result:\n", scaled)
print("Shape:", scaled.shape)
print("Mean:", scaled_col_mean)
print("Std Dev:", scaled_col_std)


# -----------------------------
# Apply MinMaxScaler to the selected features
# -----------------------------
min_max = MinMaxScaler()
scaled2 = min_max.fit_transform(frame2)

print("\nMinMaxScaler Result:\n", scaled2)

# Report the per-column (axis=0) mean and standard deviation, in that order.
for stat_label, stat_value in (
    ("Mean:", scaled2.mean(axis=0)),
    ("Std Dev:", scaled2.std(axis=0)),
):
    print(stat_label, stat_value)


# -----------------------------
# Feature Selection (low-variance filter)
# -----------------------------
from sklearn.feature_selection import VarianceThreshold

# Cut-off is 0.5 * (1 - 0.9), i.e. roughly 0.05; features whose variance is
# lower than this are removed by the selector.
variance_cutoff = .5 * (1 - .9)
sel = VarianceThreshold(threshold=variance_cutoff)

# NOTE(review): `scaled` is StandardScaler output, so every column presumably
# has unit variance and nothing is filtered here - confirm this is intended.
useful_features1 = sel.fit_transform(scaled)
print('\nUseful Features (StandardScaler):\n', useful_features1)

# The same selector object is refitted, so afterwards `sel` reflects `scaled2` only.
useful_features2 = sel.fit_transform(scaled2)
print('\nUseful Features (MinMaxScaler):\n', useful_features2)


Manual Standardization:
 [-1.41421356 -0.70710678  0.          0.70710678  1.41421356]

StandardScaler Output:
 [[-1.41421356]
 [-0.70710678]
 [ 0.        ]
 [ 0.70710678]
 [ 1.41421356]]

MinMaxScaler Output:
 [[0.  ]
 [0.25]
 [0.5 ]
 [0.75]
 [1.  ]]

Original DataFrame:
     name  age  height  weight   bp
0    Rex    1      20       0   80
1  Peter   95     180     100  100
2    Sam   22     160      75   65
3   Rita    0     145      55   70
4  Hilda   60     125      40   40

Loaded DataFrame:
     name  age  height  weight   bp
0    Rex    1      20       0   80
1  Peter   95     180     100  100
2    Sam   22     160      75   65
3   Rita    0     145      55   70
4  Hilda   60     125      40   40

Data after replacing zeros with mean:
     name   age  height  weight   bp
0    Rex   1.0      20    67.5   80
1  Peter  95.0     180   100.0  100
2    Sam  22.0     160    75.0   65
3   Rita  44.5     145    55.0   70
4  Hilda  60.0     125    40.0   40

StandardScaler Result:
 [[-1.