In [None]:
# Install if needed (optional in Colab)
!pip install pandas scikit-learn

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np




In [None]:
# Upload a CSV file from the local system (Colab only).
import io

from google.colab import files

# `files.upload()` returns a mapping of uploaded file name -> file content (bytes).
uploaded = files.upload()

# Load the dataset.
# Read the bytes that were actually uploaded instead of a hard-coded file name:
# Colab renames repeated uploads (e.g. "sample_customer_data (1).csv"), so a
# fixed path can silently load a stale copy or fail when the name differs.
if uploaded:
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    # Nothing was uploaded (dialog cancelled): fall back to the file already
    # present in the working directory, as the original cell did.
    df = pd.read_csv('sample_customer_data.csv')

df.head()


Saving sample_customer_data.csv to sample_customer_data.csv


Unnamed: 0,CustomerID,Name,Age,Salary,Gender,City
0,1001,Alice,25.0,50000.0,Female,Delhi
1,1002,Bob,30.0,60000.0,Male,Mumbai
2,1003,Charlie,,55000.0,Male,Delhi
3,1004,David,45.0,65000.0,Male,Bangalore
4,1005,Eva,35.0,70000.0,Female,Mumbai


In [None]:
# Preprocess the customer table: impute missing values, standardize numeric
# features, one-hot encode categorical features, and combine the result.

# Identify column types by dtype (every non-numeric column counts as categorical).
num_cols = df.select_dtypes(include=["number"]).columns.tolist()
cat_cols = df.select_dtypes(exclude=["number"]).columns.tolist()

# Identifier columns are row labels, not features: standardizing an ID is
# meaningless and one-hot encoding a name yields one column per customer.
# They are kept out of the transforms and carried through unchanged.
# NOTE(review): adjust this tuple if the dataset uses other identifier columns.
id_cols = [col for col in ("CustomerID", "Name") if col in df.columns]
num_features = [col for col in num_cols if col not in id_cols]
cat_features = [col for col in cat_cols if col not in id_cols]

# Fill missing numerical values with the column median.
if num_cols:
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Fill missing categorical values with the column mode.
# `.mode()` returns an empty frame when there is nothing to count, so guard
# the `.iloc[0]` lookup instead of letting it raise IndexError.
if cat_cols:
    cat_modes = df[cat_cols].mode()
    if not cat_modes.empty:
        df[cat_cols] = df[cat_cols].fillna(cat_modes.iloc[0])

# Build the transformation pipeline for the numerical feature columns.
# remainder="drop" keeps the output a purely numeric array; passing the string
# columns through would force the stacked result to a non-numeric dtype.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, num_features)],
    remainder="drop",
)

# Apply the transformation (skip fitting when there is nothing to scale).
if num_features:
    df_transformed = preprocessor.fit_transform(df)
else:
    df_transformed = np.empty((len(df), 0))

# Create a DataFrame for the scaled numerical data. Reusing df's index keeps
# the rows aligned with the other pieces when they are concatenated below.
df_numeric_scaled = pd.DataFrame(df_transformed, columns=num_features, index=df.index)

# One-hot encode the categorical feature columns.
if cat_features:
    df_cat_encoded = pd.get_dummies(df[cat_features], drop_first=True)
else:
    df_cat_encoded = pd.DataFrame(index=df.index)

# Combine identifiers + scaled numerical + encoded categorical data.
# Drop the identifier columns before feeding this frame to a model.
df_processed = pd.concat([df[id_cols], df_numeric_scaled, df_cat_encoded], axis=1)

# Sanity check: preprocessing must neither add nor drop rows.
assert len(df_processed) == len(df), "row count changed during preprocessing"

# Show sample of processed data
df_processed.head()


Unnamed: 0,CustomerID,Age,Salary,Name_Bob,Name_Charlie,Name_David,Name_Eva,Name_Frank,Name_Grace,Name_Hannah,Name_Ian,Name_Julia,Gender_Male,City_Chennai,City_Delhi,City_Mumbai
0,-1.566699,-1.428784,-1.890482,False,False,False,False,False,False,False,False,False,False,False,True,False
1,-1.218544,-0.754829,-0.288379,True,False,False,False,False,False,False,False,False,True,False,False,True
2,-0.870388,-0.215666,-1.08943,False,True,False,False,False,False,False,False,False,True,False,True,False
3,-0.522233,1.267035,0.512673,False,False,True,False,False,False,False,False,False,True,False,False,False
4,-0.174078,-0.080875,1.313725,False,False,False,True,False,False,False,False,False,False,False,False,True


In [None]:
# Write the processed table to disk without the index column, then offer it
# as a browser download (uses `files` from google.colab, imported earlier).
processed_csv_path = 'processed_customer_data.csv'

df_processed.to_csv(processed_csv_path, index=False)
print("✅ File saved as processed_customer_data.csv")

# Optional: trigger the download of the file that was just written.
files.download(processed_csv_path)


✅ File saved as processed_customer_data.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>