In [115]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, KBinsDiscretizer, StandardScaler, OneHotEncoder # for normalization, for discretization, for standardization, for binarization
from scipy.spatial.distance import jaccard
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

In [64]:
raw_df = pd.read_csv("AWCustomers.csv")

In [65]:
raw_df = pd.DataFrame(raw_df)

In [66]:
raw_df.columns

Index(['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
       'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName',
       'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate',
       'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren',
       'YearlyIncome', 'LastUpdated'],
      dtype='object')

In [67]:
# keeping columns relevant for future bike sale prediction
df = raw_df[['CustomerID', 'FirstName', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'YearlyIncome']]

In [68]:
df.columns

Index(['CustomerID', 'FirstName', 'City', 'StateProvinceName',
       'CountryRegionName', 'PostalCode', 'BirthDate', 'Education',
       'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag',
       'NumberCarsOwned', 'NumberChildrenAtHome', 'YearlyIncome'],
      dtype='object')

In [69]:
# different type of data value types
discrete = ['HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome']
continuous = ['YearlyIncome']
nominal = ['CustomerID', 'FirstName', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'BirthDate', 'Occupation', 'Gender', 'MaritalStatus']
ordinal = ['Education']
interval = ['BirthDate']
ratio = ['YearlyIncome', 'NumberChildrenAtHome']

In [70]:
data_types = {'discrete' : discrete, 'continuous': continuous, 'nominal': nominal, 'ordinal': ordinal, 'interval': interval, 'ratio': ratio}

In [71]:
# handling missing values
df.isnull().sum()
df = df.fillna(0)

In [84]:
df

Unnamed: 0,CustomerID,FirstName,City,StateProvinceName,CountryRegionName,PostalCode,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,YearlyIncome,...,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Gender_M,MaritalStatus_S
0,21173,Chad,Wollongong,New South Wales,Australia,2500,1,3,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,13249,Ryan,Shawnee,British Columbia,Canada,V9B 2C3,1,2,1,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,29350,Julia,West Covina,California,United States,91791,0,3,0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,13503,Theodore,Liverpool,England,United Kingdom,L4 4HB,1,2,1,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,22803,Marshall,Werne,Nordrhein-Westfalen,Germany,59368,1,1,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,25414,Grace,Coronado,California,United States,92118,0,1,0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18357,11459,Tasha,Port Macquarie,New South Wales,Australia,2444,0,2,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
18358,12160,Jaclyn,Beaverton,Oregon,United States,97005,0,2,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
18359,14353,Erin,Vancouver,British Columbia,Canada,V7L 4J4,0,0,0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [73]:
# converting birthdate to age for better data mining
df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors = 'coerce') # coerce is used in case date has invalid form or is null
df['Age'] = (pd.Timestamp('today') - df['BirthDate']).dt.days // 365# finding age by subtracting the dates findind the no of days and integer divison with 365 for years
df = df.drop(columns=["BirthDate"])

In [75]:
# normalizing age and yearly income for Normalization ensures all features contribute fairly
num_cols = ["Age", "YearlyIncome"]
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [77]:
# Discretization = converting continuous numeric data into categories (bins).
discretizer = KBinsDiscretizer(n_bins = 3, encode='ordinal', strategy = 'uniform')
df['YearlyIncome'] = discretizer.fit_transform(df[['YearlyIncome']])

In [79]:
#Standardization = scaling numeric data so it has: mean:0, sd = 1
scaler = StandardScaler()
df[['Age']] = scaler.fit_transform(df[['Age']])

In [81]:
#Binarization = converting categorical attributes into binary dummy variables (0/1).
categorical_cols= ['Education','Occupation','Gender', 'MaritalStatus']
encoder = OneHotEncoder(drop = 'first', sparse_output=False)
encoded = pd.DataFrame(encoder.fit_transform(df[categorical_cols]), columns = encoder.get_feature_names_out(categorical_cols), index = df.index)
df = pd.concat([df.drop(columns=categorical_cols), encoded], axis = 1)

In [108]:
df

Unnamed: 0,CustomerID,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,YearlyIncome,Age,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Gender_M,MaritalStatus_S
0,21173,1,3,0,1.0,-0.543627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,13249,1,2,1,1.0,0.876083,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,29350,0,3,0,1.0,-0.366164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,13503,1,2,1,0.0,0.343692,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,22803,1,1,0,0.0,0.609888,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,25414,0,1,0,0.0,-0.809823,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18357,11459,0,2,0,0.0,-0.987287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
18358,12160,0,2,0,0.0,-0.188700,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
18359,14353,0,0,0,1.0,-1.164751,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [109]:
df.drop(columns = ['CustomerID'])

Unnamed: 0,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,YearlyIncome,Age,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Gender_M,MaritalStatus_S
0,1,3,0,1.0,-0.543627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,2,1,1.0,0.876083,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,3,0,1.0,-0.366164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,2,1,0.0,0.343692,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1,1,0,0.0,0.609888,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,0,1,0,0.0,-0.809823,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
18357,0,2,0,0.0,-0.987287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
18358,0,2,0,0.0,-0.188700,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
18359,0,0,0,1.0,-1.164751,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [110]:
row1 = df.iloc[0].values
row2 = df.iloc[1].values

def simple_matching(a, b):
    matches = sum(a==b)
    return matches/len(a)

sms = simple_matching(row1, row2)
sms

np.float64(0.6875)

In [111]:
def jaccard_similarity(a, b):
    return 1 - jaccard(a, b)

jaccard_similarity(row1, row2)


np.float64(0.75)

In [114]:
cosine_similarity([row1], [row2])[0][0]

np.float64(0.9999999888497483)

In [118]:
income = df['YearlyIncome']
age = df['Age']
corr, pval = pearsonr(income, age)
corr


np.float64(0.024942889620524367)