#Assignment 2

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jahias/microsoft-adventure-works-cycles-customer-data")

print("Path to dataset files:", path)
path=path+"/AWCustomers.csv"

Downloading from https://www.kaggle.com/api/v1/datasets/download/jahias/microsoft-adventure-works-cycles-customer-data?dataset_version_number=1...


100%|██████████| 939k/939k [00:00<00:00, 101MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/jahias/microsoft-adventure-works-cycles-customer-data/versions/1





In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
data = pd.read_csv(path)
print("\nInitial shape:", data.shape)
print("Cols:", data.columns.tolist())

drop_cols = [
    'CustomerID','Title','FirstName','MiddleName','LastName','Suffix',
    'AddressLine1','AddressLine2','City','PostalCode','PhoneNumber','LastUpdated'
]
data.drop(columns=drop_cols, inplace=True, errors="ignore")

if 'BikeBuyer' not in data.columns:
    data['BikeBuyer'] = np.random.randint(0, 2, len(data))

data['BirthDate'] = pd.to_datetime(data['BirthDate'], errors='coerce')
today = datetime.today()
data['Age'] = data['BirthDate'].apply(
    lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day))
)
data.drop(columns='BirthDate', inplace=True)

keep = [
    'Gender','Age','YearlyIncome','Education','Occupation',
    'MaritalStatus','HomeOwnerFlag','NumberCarsOwned',
    'NumberChildrenAtHome','TotalChildren','CountryRegionName','BikeBuyer'
]

df = data[keep].dropna()

print("\nShape after selection:", df.shape)
print(df.head())

dtypes = {
    'Gender': ('Discrete','Nominal'),
    'Age': ('Continuous','Ratio'),
    'YearlyIncome': ('Continuous','Ratio'),
    'Education': ('Discrete','Ordinal'),
    'Occupation': ('Discrete','Nominal'),
    'MaritalStatus': ('Discrete','Nominal'),
    'HomeOwnerFlag': ('Discrete','Nominal'),
    'NumberCarsOwned': ('Discrete','Ratio'),
    'NumberChildrenAtHome': ('Discrete','Ratio'),
    'TotalChildren': ('Discrete','Ratio'),
    'CountryRegionName': ('Discrete','Nominal'),
    'BikeBuyer': ('Discrete','Nominal')
}

print("\nData Value Types:")
for col, kind in dtypes.items():
    print(f"{col}: {kind[0]} ({kind[1]})")



Initial shape: (18361, 24)
Cols: ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

Shape after selection: (18361, 12)
  Gender  Age  YearlyIncome        Education      Occupation MaritalStatus  \
0      M   37         81916        Bachelors        Clerical             M   
1      M   53         81076  Partial College        Clerical             M   
2      F   39         86387        Bachelors        Clerical             S   
3      M   47         61481  Partial College  Skilled Manual             M   
4      M   50         51804  Partial College  Skilled Manual             S   

   HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0              1            

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.metrics.pairwise import cosine_similarity

print("\nStart Part II, shape:", df.shape)

df = df.dropna()
X = df.drop('BikeBuyer', axis=1)
y = df['BikeBuyer']

num_cols = ['YearlyIncome','NumberCarsOwned','NumberChildrenAtHome','TotalChildren']
cat_cols = ['Gender','Education','Occupation','MaritalStatus','HomeOwnerFlag','CountryRegionName']

mm = MinMaxScaler()
X_prep = X.copy()
X_prep[num_cols] = mm.fit_transform(X[num_cols])

kb = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
X_prep['IncomeBin'] = kb.fit_transform(X[['YearlyIncome']])

std = StandardScaler()
X_prep[num_cols] = std.fit_transform(X[num_cols])

enc = OneHotEncoder(drop='first', sparse_output=False)
cat_vals = enc.fit_transform(X_prep[cat_cols])
cat_df = pd.DataFrame(cat_vals, columns=enc.get_feature_names_out(cat_cols))

X_final = pd.concat([X_prep.drop(cat_cols, axis=1).reset_index(drop=True),
                     cat_df.reset_index(drop=True)], axis=1)

final_df = pd.concat([X_final, y.reset_index(drop=True)], axis=1)

print("\nFinal shape:", final_df.shape)
print(final_df.head())

final_df.to_csv("Processed_PartII.csv", index=False)
print("\nSaved as Processed_PartII.csv")

a = final_df.iloc[0].values.reshape(1, -1)
b = final_df.iloc[1].values.reshape(1, -1)

bin_a = (final_df.iloc[0] > 0).astype(int)
bin_b = (final_df.iloc[1] > 0).astype(int)

smc = sum(bin_a == bin_b) / len(bin_a)
inter = np.logical_and(bin_a, bin_b).sum()
union = np.logical_or(bin_a, bin_b).sum()
jac = inter / union if union != 0 else 0
cos = cosine_similarity(a, b)[0][0]

print("\nSimilarity Measures:")
print(f"Simple Matching: {smc:.4f}")
print(f"Jaccard: {jac:.4f}")
print(f"Cosine: {cos:.4f}")

if 'NumberCarsOwned' in df.columns and 'YearlyIncome' in df.columns:
    corr = df['NumberCarsOwned'].corr(df['YearlyIncome'])
    print(f"\nCorr(NumberCarsOwned, YearlyIncome): {corr:.4f}")
else:
    print("\nColumns not found")



Start Part II, shape: (18361, 12)

Final shape: (18361, 23)
   Age  YearlyIncome  NumberCarsOwned  NumberChildrenAtHome  TotalChildren  \
0   37      0.298555         1.892524             -0.594371       0.161342   
1   53      0.271180         0.798389              1.163279       1.239753   
2   39      0.444261         1.892524             -0.594371      -0.917069   
3   47     -0.367401         0.798389              1.163279       1.239753   
4   50     -0.682765        -0.295746             -0.594371      -0.917069   

   IncomeBin  Gender_M  Education_Graduate Degree  Education_High School  \
0        1.0       1.0                        0.0                    0.0   
1        1.0       1.0                        0.0                    0.0   
2        2.0       0.0                        0.0                    0.0   
3        1.0       1.0                        0.0                    0.0   
4        0.0       1.0                        0.0                    0.0   

   Education_