### Preparation

Import Libraries

In [76]:
import pandas as pd
from itertools import combinations

Read data

In [77]:
# Load the product catalogue; the path is relative to the notebook's working directory.
products = pd.read_csv('../NewData/products.csv')

Check data distribution

In [78]:
# Number of products in each category.
products['category'].value_counts()

category
Batteries         43
Oil               35
Spare Parts       31
Accessories       31
Tires             31
Shock Absorber    29
Name: count, dtype: int64

In [79]:
# Number of products in each grade.
products['grade'].value_counts()

grade
Economy     74
Premium     64
Standard    62
Name: count, dtype: int64

In [80]:
# Number of products made of each material.
products['material'].value_counts()

material
Steel               28
Lithium             23
Lead Acid           20
Synthetic Oil       20
Synthetic Rubber    19
Composite           18
Alloy               16
Aluminum            16
Mineral Oil         15
Plastic             13
Rubber              12
Name: count, dtype: int64

In [81]:
# Number of products per brand.
products['brand'].value_counts()

brand
DriveWell    48
SpeedLine    42
TorqueX      42
AutoMax      41
GearPro      27
Name: count, dtype: int64

In [82]:
# Number of products per target vehicle type.
products['vehicle_type'].value_counts()

vehicle_type
MPV          48
Crossover    43
SUV          38
Hatchback    36
Sedan        35
Name: count, dtype: int64

In [83]:
# Parse the comma-separated `compatible_vehicle` string into a list of clean
# vehicle names. Names are stripped and empty tokens dropped at parse time so
# that "A, B" yields ['A', 'B'] (not ['A', ' B']) and a trailing comma does not
# inflate the count. Non-string values (e.g. NaN, or lists from a previous run
# of this cell) are left untouched, which keeps the cell safe to re-run.
products['compatible_vehicle'] = products['compatible_vehicle'].apply(
    lambda raw: [name.strip() for name in raw.split(',') if name.strip()]
    if isinstance(raw, str) else raw
)
# Rows without a parsed list count as having zero compatible vehicles.
products['compatible_vehicle_count'] = products['compatible_vehicle'].apply(
    lambda names: len(names) if isinstance(names, list) else 0
)
products['compatible_vehicle_count'].value_counts()

compatible_vehicle_count
2    108
1     92
Name: count, dtype: int64

In [84]:
# Summary statistics of unit_price; its min and max are used later to normalise price differences.
products['unit_price'].describe()

count    200.000000
mean     263.978800
std      137.498935
min       16.200000
25%      152.960000
50%      258.690000
75%      385.070000
max      498.630000
Name: unit_price, dtype: float64

### Compute Attribute Score

The attribute score between two products A and B is defined as follows:

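attribute score = (0.5 * {category A == category B}) + (0.05 * {grade A == grade B}) + (0.05 * {material A == material B}) + (0.15 * {vehicle_type A == vehicle_type B}) + (0.15 * {compatible_vehicle A and compatible_vehicle B share at least one vehicle}) + (0.1 * (1 - |unit_price A - unit_price B| / (max unit_price - min unit_price)))

Each {condition} equals 1 when the condition holds and 0 otherwise. The last term is a price *similarity*: the absolute price difference is scaled into [0, 1] by the catalogue's price range and subtracted from 1, so identical prices contribute the full 0.1. The weights sum to 1, so the score lies in [0, 1].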

In [85]:
# Preview the first rows, including the derived compatible_vehicle_count column.
products.head()

Unnamed: 0,product_id,product_name,category,grade,material,brand,vehicle_type,compatible_vehicle,unit_price,compatible_vehicle_count
0,P0111,AutoMax Shock Absorber 159,Shock Absorber,Economy,Alloy,AutoMax,Crossover,[Honda HR-V],236.55,1
1,P0014,AutoMax Shock Absorber 316,Shock Absorber,Premium,Alloy,AutoMax,Sedan,"[Honda Civic, Hyundai Elantra]",114.15,2
2,P0017,AutoMax Shock Absorber 396,Shock Absorber,Economy,Alloy,AutoMax,Hatchback,"[Mazda 2, Suzuki Swift]",325.47,2
3,P0149,AutoMax Shock Absorber 711,Shock Absorber,Premium,Alloy,AutoMax,SUV,[Honda CR-V],283.8,1
4,P0100,AutoMax Shock Absorber 756,Shock Absorber,Economy,Alloy,AutoMax,SUV,"[Toyota Rush, Daihatsu Terios]",153.01,2


In [86]:
def compute_attribute_score(product_a, product_b, min_price, max_price):
    """Return the attribute-similarity score of two products.

    The score is a weighted sum of six components:

    * 0.50 if both products share the same ``category``
    * 0.05 if both share the same ``grade``
    * 0.05 if both share the same ``material``
    * 0.15 if both share the same ``vehicle_type``
    * 0.15 if their ``compatible_vehicle`` collections have a vehicle in common
    * 0.10 * price similarity, where price similarity is
      ``1 - |price_a - price_b| / (max_price - min_price)`` clamped to [0, 1]

    Parameters
    ----------
    product_a, product_b : mapping-like (e.g. ``pandas.Series`` row or ``dict``)
        Must support ``obj[key]`` for the keys ``category``, ``grade``,
        ``material``, ``vehicle_type``, ``compatible_vehicle`` and
        ``unit_price``. ``compatible_vehicle`` may be a list of names or the
        raw comma-separated string; any other value (e.g. NaN) is treated as
        "no known compatible vehicles".
    min_price, max_price : float
        Price range used to normalise the price difference.

    Returns
    -------
    float
        Similarity score. With the clamping applied to the price term the
        result is always within [0, 1].
    """
    def _vehicle_set(raw):
        # Accept the raw CSV string as well as an already-split list, so the
        # function does not depend on a preprocessing cell having been run.
        if isinstance(raw, str):
            raw = raw.split(',')
        if not isinstance(raw, (list, tuple, set)):
            return set()
        # Strip whitespace so "A, B" matches "B"; ignore empty/non-string tokens.
        return {name.strip() for name in raw if isinstance(name, str) and name.strip()}

    score = 0.0

    # Exact-match attributes and their weights.
    for field, weight in (
        ('category', 0.5),
        ('grade', 0.05),
        ('material', 0.05),
        ('vehicle_type', 0.15),
    ):
        if product_a[field] == product_b[field]:
            score += weight

    # Compatible Vehicle Intersection
    if _vehicle_set(product_a['compatible_vehicle']) & _vehicle_set(product_b['compatible_vehicle']):
        score += 0.15

    # Price similarity (normalised to [0, 1])
    price_diff = abs(product_a['unit_price'] - product_b['unit_price'])
    price_range = max_price - min_price
    if price_diff != price_diff:
        # NaN: at least one price is missing, so price contributes nothing
        # instead of turning the whole score into NaN.
        price_similarity = 0.0
    elif price_range > 0:
        # Clamp so that prices outside [min_price, max_price] cannot push the
        # term (and therefore the total score) below 0.
        price_similarity = min(1.0, max(0.0, 1 - price_diff / price_range))
    else:
        # Degenerate range: only identical prices count as similar.
        price_similarity = 1.0 if price_diff == 0 else 0.0
    score += 0.1 * price_similarity

    return score

In [87]:
# Generate attribute scores for all product pairs
# Generate attribute scores for all product pairs
# combinations() yields each unordered pair of rows exactly once, in row order,
# so productid_1 is the product that appears earlier in `products`, which is
# not necessarily the one with the smaller product_id.
pairs = list(combinations(products.index, 2))
min_price = products['unit_price'].min()
max_price = products['unit_price'].max()

# Materialise every row once as a plain dict keyed by index label. Looking a
# row up in this dict is far cheaper than building a Series with
# `products.loc[...]` twice per pair inside the loop.
product_rows = products.to_dict('index')

results = []
for idx1, idx2 in pairs:
    prod1 = product_rows[idx1]
    prod2 = product_rows[idx2]
    score = compute_attribute_score(prod1, prod2, min_price, max_price)
    results.append({
        'productid_1': prod1['product_id'],
        'productid_2': prod2['product_id'],
        'attribute_score': score
    })

# Order the output by (productid_1, productid_2) for a stable, readable file.
results.sort(key=lambda x: (x['productid_1'], x['productid_2']))
attribute_scores_df = pd.DataFrame(results)
attribute_scores_df.head()

Unnamed: 0,productid_1,productid_2,attribute_score
0,P0001,P0002,0.073209
1,P0001,P0003,0.064998
2,P0001,P0004,0.086591
3,P0001,P0006,0.447235
4,P0001,P0010,0.073405


In [88]:
# Persist the pairwise scores; index=False keeps the DataFrame's row index out of the CSV.
attribute_scores_df.to_csv('../NewData/attribute_scores.csv', index=False)