In [1]:
import pandas as pd
import numpy as np
np.random.seed(42)
n = 300
customer_id = np.arange(1, n+1)
age = np.random.randint(18, 70, size=n).astype(float)
age[np.random.choice(n, 20, replace=False)] = np.nan
genders = ['Male', 'Female', 'Other']
gender = np.random.choice(genders, size=n, p=[0.45, 0.45, 0.1])
gender[np.random.choice(n, 15, replace=False)] = np.nan
income = np.random.normal(loc=60, scale=20, size=n).round(2)
income[income < 10] = 10
income[np.random.choice(n, 25, replace=False)] = np.nan
purchased = np.random.choice(['Yes', 'No'], size=n, p=[0.4, 0.6])
cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
city = np.random.choice(cities, size=n, p=[0.3, 0.25, 0.2, 0.15, 0.1])
membership_years = np.abs(np.random.normal(loc=5, scale=3, size=n)).round(1)
credit_score = np.random.normal(loc=650, scale=50, size=n).round()
credit_score[credit_score < 300] = 300
credit_score[credit_score > 850] = 850
credit_score[np.random.choice(n, 30, replace=False)] = np.nan
account_balance = np.random.normal(loc=5000, scale=2000, size=n).round(2)
outliers_idx = np.random.choice(n, 5, replace=False)
account_balance[outliers_idx] = account_balance[outliers_idx] * 5
df = pd.DataFrame({
    'CustomerID': customer_id,
    'Age': age,
    'Gender': gender,
    'Annual Income (k$)': income,
    'Purchased': purchased,
    'City': city,
    'Membership Years': membership_years,
    'Credit Score': credit_score,
    'Account Balance': account_balance
})

df.head()

Unnamed: 0,CustomerID,Age,Gender,Annual Income (k$),Purchased,City,Membership Years,Credit Score,Account Balance
0,1,56.0,Male,65.95,Yes,Phoenix,5.2,699.0,5307.66
1,2,69.0,Other,72.46,Yes,Chicago,8.1,712.0,6532.93
2,3,46.0,Female,79.0,No,Los Angeles,3.6,671.0,1564.41
3,4,32.0,Female,70.79,No,Houston,4.1,633.0,5569.74
4,5,60.0,Female,86.93,No,New York,5.7,570.0,33103.45


In [2]:
df.sample(10)

Unnamed: 0,CustomerID,Age,Gender,Annual Income (k$),Purchased,City,Membership Years,Credit Score,Account Balance
126,127,51.0,Female,51.68,No,New York,8.2,,5992.33
94,95,43.0,Female,65.12,Yes,New York,4.4,604.0,4328.54
255,256,33.0,Male,60.95,No,New York,2.5,,7342.24
63,64,31.0,Male,52.35,Yes,New York,11.8,610.0,6487.63
129,130,40.0,Male,83.88,Yes,Los Angeles,6.6,659.0,3880.52
52,53,26.0,Female,73.27,Yes,Houston,4.6,613.0,5835.35
6,7,38.0,Male,,No,Chicago,7.6,627.0,7213.93
156,157,68.0,Female,60.87,No,Los Angeles,4.4,642.0,3558.72
231,232,40.0,Female,60.47,Yes,Chicago,6.4,629.0,6259.5
136,137,44.0,Male,63.39,Yes,New York,7.8,,2491.4


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CustomerID          300 non-null    int64  
 1   Age                 280 non-null    float64
 2   Gender              300 non-null    object 
 3   Annual Income (k$)  275 non-null    float64
 4   Purchased           300 non-null    object 
 5   City                300 non-null    object 
 6   Membership Years    300 non-null    float64
 7   Credit Score        270 non-null    float64
 8   Account Balance     300 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 21.2+ KB


In [4]:
df.isnull().sum()

Unnamed: 0,0
CustomerID,0
Age,20
Gender,0
Annual Income (k$),25
Purchased,0
City,0
Membership Years,0
Credit Score,30
Account Balance,0


In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
num_cols = ['Age', 'Annual Income (k$)', 'Membership Years', 'Credit Score', 'Account Balance']
cat_cols = ['Gender', 'Purchased', 'City']
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')
df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [8]:
le_purchased = LabelEncoder()
df['Purchased'] = le_purchased.fit_transform(df['Purchased'])
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe_features = ohe.fit_transform(df[['Gender', 'City']])
ohe_feature_names = ohe.get_feature_names_out(['Gender', 'City'])
df_ohe = pd.DataFrame(ohe_features, columns=ohe_feature_names, index=df.index)
df = pd.concat([df.drop(['Gender', 'City'], axis=1), df_ohe], axis=1)

In [10]:
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])
for col in ['Account Balance']:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)

In [11]:
df.head()

Unnamed: 0,CustomerID,Age,Annual Income (k$),Purchased,Membership Years,Credit Score,Account Balance,Gender_Male,Gender_Other,Gender_nan,City_Houston,City_Los Angeles,City_New York,City_Phoenix
0,1,0.826604,0.230346,1,-0.048566,0.965262,-0.009634,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,1.706382,0.582667,1,0.938634,1.245129,0.491834,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,3,0.149852,0.936612,0,-0.593227,0.362472,-1.541639,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4,-0.797601,0.492287,0,-0.423021,-0.455601,0.097628,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,5,1.097305,1.365783,0,0.121641,-1.811879,4.947448,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
df.isnull().sum()

Unnamed: 0,0
CustomerID,0
Age,0
Annual Income (k$),0
Purchased,0
Membership Years,0
Credit Score,0
Account Balance,0
Gender_Male,0
Gender_Other,0
Gender_nan,0
