In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Toy customer table, assembled one column at a time.
# np.nan marks the entries that are deliberately missing.
data = {}
data['Customer ID'] = ['C001', 'C002', 'C003', 'C004', 'C005', 'C006', 'C007', 'C008']
data['Age'] = [25, 35, np.nan, 42, 28, 55, 31, 45]
data['Gender'] = ['Male', 'Female', 'Male', np.nan, 'Female', 'Male', 'Female', 'Male']
data['Income'] = [50000, 75000, 60000, np.nan, 45000, 90000, 55000, 80000]
data['City'] = ['Urban', 'Rural', 'Urban', 'Urban', np.nan, 'Rural', 'Urban', 'Rural']
data['Subscription Status'] = [
    'Subscribed', 'Not Subscribed', 'Subscribed', 'Subscribed',
    'Not Subscribed', np.nan, 'Subscribed', 'Not Subscribed',
]

# Column order in the frame follows the insertion order of the dict keys above.
df = pd.DataFrame(data)

# Show the raw table, then a 65-character rule surrounded by blank lines.
section_rule = "=" * 65
print("Original DataFrame:")
print(df)
print("\n" + section_rule + "\n")

Original DataFrame:
  Customer ID   Age  Gender   Income   City Subscription Status
0        C001  25.0    Male  50000.0  Urban          Subscribed
1        C002  35.0  Female  75000.0  Rural      Not Subscribed
2        C003   NaN    Male  60000.0  Urban          Subscribed
3        C004  42.0     NaN      NaN  Urban          Subscribed
4        C005  28.0  Female  45000.0    NaN      Not Subscribed
5        C006  55.0    Male  90000.0  Rural                 NaN
6        C007  31.0  Female  55000.0  Urban          Subscribed
7        C008  45.0    Male  80000.0  Rural      Not Subscribed




In [34]:
# II. Handle Missing Values
#
# Imputation strategy, one replacement value per column:
#   - numeric columns (Age, Income)                     -> column median
#   - categorical columns (Gender, City, Subscription)  -> most frequent value
# Series.mode() returns its result sorted, so [0] picks the first value
# when several values tie for most frequent.
fill_values = {
    'Age': df['Age'].median(),
    'Income': df['Income'].median(),
    'Gender': df['Gender'].mode()[0],
    'City': df['City'].mode()[0],
    'Subscription Status': df['Subscription Status'].mode()[0]
}

# Rebind the name instead of passing inplace=True: fillna then returns a new
# frame, so the object shown by the earlier cell is not modified behind the
# reader's back and the call can be chained. Columns that are not keys of
# fill_values (e.g. 'Customer ID') are left untouched.
df = df.fillna(value=fill_values)

print("\nValues Filled :")
print(fill_values)

print("\nAfter handling missing values:")
print(df)
# Total count of NaN cells left anywhere in the frame (all columns).
print(f"\nRemaining missing values: {df.isna().sum().sum()}")
print("\n" + "="*65 + "\n")


Values Filled :
{'Age': 35.0, 'Income': 60000.0, 'Gender': 'Male', 'City': 'Urban', 'Subscription Status': 'Subscribed'}

After handling missing values:
  Customer ID   Age  Gender   Income   City Subscription Status
0        C001  25.0    Male  50000.0  Urban          Subscribed
1        C002  35.0  Female  75000.0  Rural      Not Subscribed
2        C003  35.0    Male  60000.0  Urban          Subscribed
3        C004  42.0    Male  60000.0  Urban          Subscribed
4        C005  28.0  Female  45000.0  Urban      Not Subscribed
5        C006  55.0    Male  90000.0  Rural          Subscribed
6        C007  31.0  Female  55000.0  Urban          Subscribed
7        C008  45.0    Male  80000.0  Rural      Not Subscribed

Remaining missing values: 0




In [35]:
## Label Encoding
def _class_to_code(encoder):
    """Return the {class label: integer code} mapping of a fitted LabelEncoder."""
    labels = encoder.classes_
    return dict(zip(labels, encoder.transform(labels)))


# One independent encoder per categorical column, so each keeps its own
# classes_ and can be used on its own afterwards.
le_gender, le_city, le_subscription = LabelEncoder(), LabelEncoder(), LabelEncoder()

# The integer codes are written to new *_Encoded columns; the original text
# columns stay in df unchanged.
df['Gender_Encoded'] = le_gender.fit_transform(df['Gender'])
df['City_Encoded'] = le_city.fit_transform(df['City'])
df['Subscription_Encoded'] = le_subscription.fit_transform(df['Subscription Status'])

# Report which label was assigned which code for every encoded column.
print("Label Encoding applied:")
print(f"Gender mapping: {_class_to_code(le_gender)}")
print(f"City mapping: {_class_to_code(le_city)}")
print(f"Subscription mapping: {_class_to_code(le_subscription)}")
print("\n")

Label Encoding applied:
Gender mapping: {'Female': np.int64(0), 'Male': np.int64(1)}
City mapping: {'Rural': np.int64(0), 'Urban': np.int64(1)}
Subscription mapping: {'Not Subscribed': np.int64(0), 'Subscribed': np.int64(1)}




In [36]:
# III. Feature Scaling
print("III. FEATURE SCALING (MinMaxScaler)")
print("-" * 40)

# Columns to rescale, and the names under which the scaled values are stored
# ('Age' -> 'Age_Scaled', 'Income' -> 'Income_Scaled').
numerical_cols = ['Age', 'Income']
scaled_cols = [f"{col}_Scaled" for col in numerical_cols]

# Fit the scaler on the numeric columns of df and keep the result in a
# separate copy, so df itself retains only the unscaled values.
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[scaled_cols] = scaler.fit_transform(df[numerical_cols])

# Side-by-side view: raw values first, rescaled values second.
print("Before scaling:")
print(df[['Customer ID'] + numerical_cols])
print("\nAfter scaling (0-1 range):")
print(df_scaled[['Customer ID'] + scaled_cols])
print("\n" + "=" * 65 + "\n")

III. FEATURE SCALING (MinMaxScaler)
----------------------------------------
Before scaling:
  Customer ID   Age   Income
0        C001  25.0  50000.0
1        C002  35.0  75000.0
2        C003  35.0  60000.0
3        C004  42.0  60000.0
4        C005  28.0  45000.0
5        C006  55.0  90000.0
6        C007  31.0  55000.0
7        C008  45.0  80000.0

After scaling (0-1 range):
  Customer ID  Age_Scaled  Income_Scaled
0        C001    0.000000       0.111111
1        C002    0.333333       0.666667
2        C003    0.333333       0.333333
3        C004    0.566667       0.333333
4        C005    0.100000       0.000000
5        C006    1.000000       1.000000
6        C007    0.200000       0.222222
7        C008    0.666667       0.777778




In [37]:
# Final DataFrame with all preprocessing

# Display options: lift the column limit and widen the console so the whole
# frame prints on one row per record. These settings are global and remain
# in effect for any cell executed afterwards.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', None)

# Identifier plus the scaled numeric and encoded categorical columns.
final_columns = [
    'Customer ID',
    'Age_Scaled', 'Income_Scaled',
    'Gender_Encoded', 'City_Encoded', 'Subscription_Encoded',
]

print("FINAL PREPROCESSED DATAFRAME:")
print("-" * 40)
final_df = df_scaled[final_columns]
print(final_df)
print("\n")

# Summary statistics
print("Summary Statistics:")
summary_stats = final_df.describe()
print(summary_stats)

FINAL PREPROCESSED DATAFRAME:
----------------------------------------
  Customer ID  Age_Scaled  Income_Scaled  Gender_Encoded  City_Encoded  Subscription_Encoded
0        C001    0.000000       0.111111               1             1                     1
1        C002    0.333333       0.666667               0             0                     0
2        C003    0.333333       0.333333               1             1                     1
3        C004    0.566667       0.333333               1             1                     1
4        C005    0.100000       0.000000               0             1                     0
5        C006    1.000000       1.000000               1             0                     1
6        C007    0.200000       0.222222               0             1                     1
7        C008    0.666667       0.777778               1             0                     0


Summary Statistics:
       Age_Scaled  Income_Scaled  Gender_Encoded  City_Encoded  Subscr