In [14]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
import numpy as np

scaler = MinMaxScaler()
std_scaler = StandardScaler()
YEARLY_INCOME_COL = "YearlyIncome"

# Load
df = pd.read_csv("AWCustomers.csv")
print(df.columns)

# Select features (make sure to copy to avoid SettingWithCopyWarning)
selected = [
    "Occupation", "Education", "Gender", "MaritalStatus", "HomeOwnerFlag",
    "NumberCarsOwned", "TotalChildren", YEARLY_INCOME_COL
]
df_selected = df[selected].copy()

# Handle nulls
df_selected[YEARLY_INCOME_COL] = df_selected[YEARLY_INCOME_COL].fillna(df_selected[YEARLY_INCOME_COL].median())

# Scale income
df_selected.loc[:, [YEARLY_INCOME_COL]] = scaler.fit_transform(df_selected[[YEARLY_INCOME_COL]])
df_selected.loc[:, [YEARLY_INCOME_COL]] = std_scaler.fit_transform(df_selected[[YEARLY_INCOME_COL]])

# One-hot encoding
df_final = pd.get_dummies(df_selected,
                          columns=["Gender", "MaritalStatus", "Education",
                                   "Occupation", "HomeOwnerFlag"])

# Similarity measures
obj1 = df_final.iloc[0].values.reshape(1,-1)
obj2 = df_final.iloc[1].values.reshape(1,-1)

cos_sim = cosine_similarity(obj1, obj2)
print("Cosine similarity:", cos_sim)

smc = np.sum(obj1 == obj2) / len(obj1.flatten())
print("Simple Matching Coefficient:", smc)

# Jaccard needs binary input
obj1_bin = (obj1 > 0).astype(int).flatten()
obj2_bin = (obj2 > 0).astype(int).flatten()
jac_sim = 1 - jaccard(obj1_bin, obj2_bin)
print("Jaccard similarity:", jac_sim)


Index(['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
       'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName',
       'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate',
       'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'LastUpdated'],
      dtype='object')
Cosine similarity: [[0.86014719]]
Simple Matching Coefficient: 0.7368421052631579
Jaccard similarity: 0.7777777777777778


  df_selected.loc[:, [YEARLY_INCOME_COL]] = scaler.fit_transform(df_selected[[YEARLY_INCOME_COL]])
