In [1]:
import pandas as pd

In [23]:
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Read the customer table and the sales table from the working directory
df, sales_df = (pd.read_csv(csv_name) for csv_name in ("AWCustomers.csv", "AWSales.csv"))
# Report (rows, columns) of each table, one per line
print(df.shape, sales_df.shape, sep="\n")

(18361, 24)
(18355, 3)


In [4]:
# Attach the BikeBuyer label from the sales table to every customer row.
# validate='many_to_one' makes pandas raise MergeError if a CustomerID occurs
# more than once in sales_df, instead of silently duplicating customer rows.
# NOTE(review): the customer table has more rows than the sales table, yet no
# row ends up with a missing BikeBuyer later on, which suggests repeated
# CustomerIDs in df -- confirm whether they should be de-duplicated while the
# ID column is still available.
df = df.merge(
    sales_df[['CustomerID', 'BikeBuyer']],
    on='CustomerID',
    how='left',
    validate='many_to_one',
)
print(df.head(12))

    CustomerID Title FirstName MiddleName  LastName Suffix  \
0        21173   NaN      Chad          C      Yuan    NaN   
1        13249   NaN      Ryan        NaN     Perry    NaN   
2        29350   NaN     Julia        NaN  Thompson    NaN   
3        13503   NaN  Theodore        NaN     Gomez    NaN   
4        22803   NaN  Marshall          J      Shan    NaN   
5        22092   NaN    Andrea        NaN     Young    NaN   
6        11229   NaN    Adrian          C   Stewart    NaN   
7        24179   NaN     Robyn          R    Blanco    NaN   
8        12127   NaN     Grace        NaN     Davis    NaN   
9        19903   NaN    Marcus          N    Nelson    NaN   
10       25924   NaN    Hannah        NaN  Robinson    NaN   
11       23033   NaN      Alex          J    Turner    NaN   

              AddressLine1 AddressLine2          City    StateProvinceName  \
0       7090 C. Mount Hood          NaN    Wollongong      New South Wales   
1      3651 Willow Lake Rd          N

In [6]:
#  Part (a)
# Title, name, address, phone, ID and last-updated columns to discard.
columns_to_remove = [
    'Title', 'CustomerID', 'FirstName', 'MiddleName', 'LastName', 'Suffix',
    'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'PhoneNumber',
    'LastUpdated', 'PostalCode',
]
# Rebind rather than mutate with inplace=True, and pass only columns=
# (axis=1 next to columns= was redundant).
df = df.drop(columns=columns_to_remove)

In [7]:
# Show the first five rows (head() defaults to n=5)
print(df.head())

  CountryRegionName   BirthDate        Education      Occupation Gender  \
0         Australia  1987-11-13        Bachelors        Clerical      M   
1            Canada  1972-07-21  Partial College        Clerical      M   
2     United States  1985-11-09        Bachelors        Clerical      F   
3    United Kingdom  1977-10-18  Partial College  Skilled Manual      M   
4           Germany  1975-02-05  Partial College  Skilled Manual      M   

  MaritalStatus  HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  \
0             M              1                3                     0   
1             M              1                2                     1   
2             S              0                3                     0   
3             M              1                2                     1   
4             S              1                1                     0   

   TotalChildren  YearlyIncome  BikeBuyer  
0              1         81916          1  
1              2      

In [10]:
# Derive Age in completed years from BirthDate, then drop the raw date column.
# Unparseable dates become NaT (errors='coerce') and produce a missing Age.
df['BirthDate'] = pd.to_datetime(df['BirthDate'], errors='coerce')
today = datetime.now()
current_year = today.year
# Subtracting calendar years alone overstates the age of anyone whose birthday
# has not yet occurred this year, so take one year off for those rows.
birth_month = df['BirthDate'].dt.month
birth_day = df['BirthDate'].dt.day
birthday_pending = (birth_month > today.month) | (
    (birth_month == today.month) & (birth_day > today.day)
)
df['Age'] = current_year - df['BirthDate'].dt.year - birthday_pending.astype(int)
df = df.drop(columns=['BirthDate'])

In [11]:
# Show the first two rows
print(df.iloc[:2])

  CountryRegionName        Education Occupation Gender MaritalStatus  \
0         Australia        Bachelors   Clerical      M             M   
1            Canada  Partial College   Clerical      M             M   

   HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0              1                3                     0              1   
1              1                2                     1              2   

   YearlyIncome  BikeBuyer  Age  
0         81916          1   38  
1         81076          1   53  


In [12]:
# (b)
# Columns kept for the analysis; BikeBuyer is the label, the rest are predictors
selected_features = [
    'Gender',
    'Age',
    'YearlyIncome',
    'Education',
    'Occupation',
    'MaritalStatus',
    'HomeOwnerFlag',
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
    'CountryRegionName',
    'BikeBuyer',
]

# Independent copy so later edits do not touch df
df_selected = df.loc[:, selected_features].copy()
print(f"\nDataFrame shape after feature selection: {df_selected.shape}")


DataFrame shape after feature selection: (18361, 12)


In [13]:
# (c)
# Attribute classification: column name -> (value type, measurement scale)
data_types = dict(
    Gender=('Discrete', 'Nominal'),
    Age=('Continuous', 'Ratio'),
    YearlyIncome=('Continuous', 'Ratio'),
    Education=('Discrete', 'Ordinal'),
    Occupation=('Discrete', 'Nominal'),
    MaritalStatus=('Discrete', 'Nominal'),
    HomeOwnerFlag=('Discrete', 'Nominal'),
    NumberCarsOwned=('Discrete', 'Ratio'),
    NumberChildrenAtHome=('Discrete', 'Ratio'),
    TotalChildren=('Discrete', 'Ratio'),
    CountryRegionName=('Discrete', 'Nominal'),
    BikeBuyer=('Discrete', 'Nominal'),
)

# One "column: value type (scale)" line per attribute
print("\nData Value Types:")
print("\n".join(
    f"{column}: {value_type} ({scale})"
    for column, (value_type, scale) in data_types.items()
))

print("\nSample of selected data:")
print(df_selected.head())


Data Value Types:
Gender: Discrete (Nominal)
Age: Continuous (Ratio)
YearlyIncome: Continuous (Ratio)
Education: Discrete (Ordinal)
Occupation: Discrete (Nominal)
MaritalStatus: Discrete (Nominal)
HomeOwnerFlag: Discrete (Nominal)
NumberCarsOwned: Discrete (Ratio)
NumberChildrenAtHome: Discrete (Ratio)
TotalChildren: Discrete (Ratio)
CountryRegionName: Discrete (Nominal)
BikeBuyer: Discrete (Nominal)

Sample of selected data:
  Gender  Age  YearlyIncome        Education      Occupation MaritalStatus  \
0      M   38         81916        Bachelors        Clerical             M   
1      M   53         81076  Partial College        Clerical             M   
2      F   40         86387        Bachelors        Clerical             S   
3      M   48         61481  Partial College  Skilled Manual             M   
4      M   50         51804  Partial College  Skilled Manual             S   

   HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0              1          

In [14]:
# Part 2: Data cleaning and preprocessing

In [15]:
# a
# Discard every row that has a missing value in any selected column
df_clean = df_selected.dropna(axis=0, how='any')
print("Shape after removing null values: {}".format(df_clean.shape))

Shape after removing null values: (18361, 12)


In [16]:
# Separate the BikeBuyer label from the predictor columns
y = df_clean['BikeBuyer']
X = df_clean.drop(columns='BikeBuyer')

# Column groups: numeric predictors vs. predictors treated as categories
numeric_cols = [
    'Age',
    'YearlyIncome',
    'NumberCarsOwned',
    'NumberChildrenAtHome',
    'TotalChildren',
]
categorical_cols = [
    'Gender',
    'Education',
    'Occupation',
    'MaritalStatus',
    'HomeOwnerFlag',
    'CountryRegionName',
]

In [19]:
# (b) Min-max normalisation of the numeric columns (MinMaxScaler default range).
# The result is kept in its own frame: writing it into X_processed, as before,
# meant the standardisation in (d) overwrote it and this step's output was lost.
scaler = MinMaxScaler()
X_normalized = X.copy()
X_normalized[numeric_cols] = scaler.fit_transform(X[numeric_cols])

X_processed = X.copy()

# (c) Discretise the raw YearlyIncome into 4 equal-width bins, coded as ordinals
discretizer = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
X_processed['YearlyIncome_Binned'] = discretizer.fit_transform(X[['YearlyIncome']])

# (d) Standardise the numeric columns; these are the values used in final_df
std_scaler = StandardScaler()
X_processed[numeric_cols] = std_scaler.fit_transform(X[numeric_cols])

# (e) One-hot encode the categorical columns; drop='first' omits the first
# category of each column
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cat = encoder.fit_transform(X_processed[categorical_cols])
encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X_processed.index  # keep row alignment for the concat below
)
# Swap the original categorical columns for their encoded counterparts
X_final = pd.concat([
    X_processed.drop(categorical_cols, axis=1),
    encoded_cat_df
], axis=1)

# Re-attach the label as the last column
final_df = pd.concat([X_final, y], axis=1)
print(f"\nFinal processed dataset shape: {final_df.shape}")
print("\nProcessed dataset sample:")
print(final_df.head())


Final processed dataset shape: (18361, 23)

Processed dataset sample:
        Age  YearlyIncome  NumberCarsOwned  NumberChildrenAtHome  \
0 -0.482516      0.298555         1.892524             -0.594371   
1  0.851033      0.271180         0.798389              1.163279   
2 -0.304710      0.444261         1.892524             -0.594371   
3  0.406517     -0.367401         0.798389              1.163279   
4  0.584324     -0.682765        -0.295746             -0.594371   

   TotalChildren  YearlyIncome_Binned  Gender_M  Education_Graduate Degree  \
0       0.161342                  1.0       1.0                        0.0   
1       1.239753                  1.0       1.0                        0.0   
2      -0.917069                  2.0       0.0                        0.0   
3       1.239753                  1.0       1.0                        0.0   
4      -0.917069                  0.0       1.0                        0.0   

   Education_High School  Education_Partial College

In [20]:
# Write the processed table to disk without the index column
output_path = "Processed_Dataset.csv"
final_df.to_csv(output_path, index=False)
print(f"\nProcessed dataset saved as '{output_path}'")


Processed dataset saved as 'Processed_Dataset.csv'


In [21]:
# Part 3: Similarity measures between records

In [24]:
# (a)
# Feature vectors of the first two records, with the BikeBuyer label removed
first_row, second_row = (final_df.iloc[pos].drop('BikeBuyer') for pos in (0, 1))
obj1 = first_row.to_numpy().reshape(1, -1)
obj2 = second_row.to_numpy().reshape(1, -1)

# Binary view of each record: 1 where the feature value is positive, else 0.
# NOTE(review): this thresholds every column at 0, including non-binary ones --
# confirm that is the intended binarisation for SMC / Jaccard.
binary_obj1 = first_row.gt(0).astype(int)
binary_obj2 = second_row.gt(0).astype(int)

# Simple Matching Coefficient: share of positions where both records agree
matches = binary_obj1.eq(binary_obj2).sum()
simple_matching = matches / binary_obj1.size

# Jaccard Similarity: positions set in both over positions set in either,
# defined as 0 when neither record has any position set
intersection = (binary_obj1.astype(bool) & binary_obj2.astype(bool)).sum()
union = (binary_obj1.astype(bool) | binary_obj2.astype(bool)).sum()
jaccard_similarity = intersection / union if union != 0 else 0

# Cosine Similarity on the unbinarised feature vectors
cosine_sim = cosine_similarity(obj1, obj2)[0][0]

print("\nSimilarity Measures between Object 1 and Object 2:")
print(f"Simple Matching Coefficient: {simple_matching:.4f}")
print(f"Jaccard Similarity: {jaccard_similarity:.4f}")
print(f"Cosine Similarity: {cosine_sim:.4f}")


Similarity Measures between Object 1 and Object 2:
Simple Matching Coefficient: 0.8182
Jaccard Similarity: 0.6000
Cosine Similarity: 0.4477


In [25]:
# (b)
# CommuteDistance is not in the dataset