In [17]:
import pandas as pd
import numpy as np
import re
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Part I: Feature Selection

In [23]:
customers = pd.read_csv("archive/AWCustomers.csv")
sales = pd.read_csv("archive/AWSales.csv")
test_class = pd.read_csv("archive/AWTest-Classification.csv")
test_reg = pd.read_csv("archive/AWTest-Regression.csv")



print("Customers shape:", customers.shape)
print("Sales shape:", sales.shape)

Customers shape: (18361, 24)
Sales shape: (18355, 3)


In [25]:
# --- Part I: Feature Selection ---
# Choose meaningful features
selected_features = [
    "CustomerID", "YearlyIncome", "TotalChildren", "NumberChildrenAtHome",
    "Age", "Gender", "MaritalStatus", "HomeOwnerFlag",
    "NumberCarsOwned", "Occupation", "Education", "BikeBuyer"  # BikeBuyer = target
]

# Keep only those available
features = [f for f in selected_features if f in customers.columns]
df = customers[features].copy()
print("\nSelected Features:", features)


Selected Features: ['CustomerID', 'YearlyIncome', 'TotalChildren', 'NumberChildrenAtHome', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'Occupation', 'Education']


In [27]:
def attribute_type(series):
    if pd.api.types.is_numeric_dtype(series):
        nunique = series.nunique()
        if nunique < 20:
            return "Discrete (numeric, Ratio)"
        else:
            return "Continuous (numeric, Ratio)"
    else:
        return "Categorical (Nominal)"
    
print("\nAttribute Types:")
for col in df.columns:
    print(f"{col}: {attribute_type(df[col])}")


Attribute Types:
CustomerID: Continuous (numeric, Ratio)
YearlyIncome: Continuous (numeric, Ratio)
TotalChildren: Discrete (numeric, Ratio)
NumberChildrenAtHome: Discrete (numeric, Ratio)
Gender: Categorical (Nominal)
MaritalStatus: Categorical (Nominal)
HomeOwnerFlag: Discrete (numeric, Ratio)
NumberCarsOwned: Discrete (numeric, Ratio)
Occupation: Categorical (Nominal)
Education: Categorical (Nominal)


# Part II: Preprocessing

In [34]:
# (a) Handle Nulls
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(exclude=[np.number]).columns

df[num_cols] = SimpleImputer(strategy="median").fit_transform(df[num_cols])
for c in cat_cols:
    df[c] = df[c].fillna(df[c].mode()[0])


In [36]:
# (b) Normalization (Min-Max)
df_norm = df.copy()
df_norm[num_cols] = MinMaxScaler().fit_transform(df[num_cols])


In [38]:
# (c) Discretization: binning YearlyIncome
if "YearlyIncome" in df_norm.columns:
    df_norm["YearlyIncome_Bin"] = pd.qcut(df_norm["YearlyIncome"], 4, labels=False)


In [40]:
# (d) Standardization (Z-score)
df_std = df_norm.copy()
df_std[num_cols] = StandardScaler().fit_transform(df_std[num_cols])

In [53]:
# (e) One-Hot Encoding categorical
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder (new sklearn uses sparse_output instead of sparse)
ohe = OneHotEncoder(sparse_output=False, drop=None, handle_unknown="ignore")

# Fit + transform only categorical columns
ohe_data = ohe.fit_transform(df_std[cat_cols])

# Convert to DataFrame with same index for alignment
ohe_df = pd.DataFrame(
    ohe_data,
    columns=ohe.get_feature_names_out(cat_cols),
    index=df_std.index
)

# Concatenate back with numeric (standardized) columns
df_final = pd.concat([df_std.drop(columns=cat_cols), ohe_df], axis=1)

print("\nFinal transformed data shape:", df_final.shape)
print(df_final.head())


Final transformed data shape: (18361, 21)
   CustomerID  YearlyIncome  TotalChildren  NumberChildrenAtHome  \
0    0.174472      0.298555       0.161342             -0.594371   
1   -1.310484      0.271180       1.239753              1.163279   
2    1.706839      0.444261      -0.917069             -0.594371   
3   -1.262884     -0.367401       1.239753              1.163279   
4    0.479933     -0.682765      -0.917069             -0.594371   

   HomeOwnerFlag  NumberCarsOwned  YearlyIncome_Bin  Gender_F  Gender_M  \
0       0.798603         1.892524                 2       0.0       1.0   
1       0.798603         0.798389                 2       0.0       1.0   
2      -1.252187         1.892524                 2       1.0       0.0   
3       0.798603         0.798389                 1       0.0       1.0   
4       0.798603        -0.295746                 0       0.0       1.0   

   MaritalStatus_M  ...  Occupation_Clerical  Occupation_Management  \
0              1.0  ...   

# Part III: Similarity & Correlation

In [56]:
# Pick two customers for similarity (row 0 and 1)
a, b = df_final.iloc[0].values, df_final.iloc[1].values

# Simple Matching
simple_match = np.mean(a == b)

# Jaccard (binary)
a_bin, b_bin = (a > 0).astype(int), (b > 0).astype(int)
jaccard = np.sum((a_bin & b_bin)) / np.sum((a_bin | b_bin))

# Cosine
cosine = cosine_similarity([a], [b])[0,0]

print("\nSimilarity between row 0 and row 1:")
print("Simple Matching:", round(simple_match, 4))
print("Jaccard:", round(jaccard, 4))
print("Cosine:", round(cosine, 4))


Similarity between row 0 and row 1:
Simple Matching: 0.6667
Jaccard: 0.6667
Cosine: 0.6387
