In [24]:
import pandas as pd
import numpy as np

# ---- helpers ----
def canonicalize_pairs(df, a='productid_1', b='productid_2'):
    """Return a copy of ``df`` with order-independent pair keys ``p1``/``p2``.

    Parameters
    ----------
    df : DataFrame holding the two ID columns named by ``a`` and ``b``.
    a, b : names of the two ID columns that together describe a pair.

    Returns
    -------
    A new DataFrame (the input is not modified) in which columns ``a`` and
    ``b`` are cast to ``str`` and whitespace-stripped, and two extra columns
    are added: ``p1`` holds the smaller and ``p2`` the larger of the two
    cleaned IDs on each row, so (A, B) and (B, A) produce identical keys.

    Ordering is plain string comparison, e.g. '10' sorts before '9'.

    NOTE(review): ``astype(str)`` may turn missing IDs into literal text such
    as 'nan' -- confirm the inputs never contain missing IDs.
    """
    out = df.copy()

    # Normalise both ID columns: cast to text, trim surrounding whitespace.
    for col in (a, b):
        out[col] = out[col].astype(str).str.strip()

    # Row-wise sort puts each pair into one canonical (low, high) order.
    ordered = np.sort(out[[a, b]].to_numpy(), axis=1)
    out['p1'], out['p2'] = ordered[:, 0], ordered[:, 1]
    return out

# ---- config ----
NEW_DATA_DIR       = '../NewData'   # directory holding the input CSVs and the output CSV
ATTRIBUTE_WEIGHT   = 0.6            # weight of attribute_score in the blended score
TRANSACTION_WEIGHT = 0.4            # weight of transaction_score in the blended score
PAIR_KEYS          = ['p1', 'p2']   # canonical pair columns added by canonicalize_pairs

# ---- load ----
attribute   = pd.read_csv(f'{NEW_DATA_DIR}/attribute_scores.csv')     # cols: productid_1, productid_2, attribute_score
transaction = pd.read_csv(f'{NEW_DATA_DIR}/transaction_scores.csv')   # cols: productid_1, productid_2, transaction_score

# ---- canonicalize & dedupe ----
# drop_duplicates keeps the first row per canonical pair, so when (A,B) and
# (B,A) both appear, only the score on the earlier row is retained.
attr_c = canonicalize_pairs(attribute).drop_duplicates(PAIR_KEYS)
txn_c  = canonicalize_pairs(transaction).drop_duplicates(PAIR_KEYS)

# ---- merge (keep all attribute rows) ----
# validate='one_to_one' makes the merge fail loudly if either side ever has
# repeated pair keys, instead of silently multiplying rows.
final_df = (attr_c[PAIR_KEYS + ['attribute_score']]
            .merge(txn_c[PAIR_KEYS + ['transaction_score']],
                   on=PAIR_KEYS, how='left', validate='one_to_one'))

# ---- substitution score ----
final_df['attribute_score']   = final_df['attribute_score'].astype(float)
final_df['transaction_score'] = final_df['transaction_score'].astype(float)

# Missing scores count as 0.0 in the blend only (the stored score columns keep
# their NaN), so a pair with no transaction row contributes just
# ATTRIBUTE_WEIGHT * attribute_score. The result is clipped to [0, 1].
final_df['substitution_score'] = (
    ATTRIBUTE_WEIGHT * final_df['attribute_score'].fillna(0.0) +
    TRANSACTION_WEIGHT * final_df['transaction_score'].fillna(0.0)
).clip(0, 1)

print(final_df.shape)
print(final_df.head())

final_df.to_csv(f'{NEW_DATA_DIR}/final_substitution_scores.csv', index=False)


(19900, 5)
      p1     p2  attribute_score  transaction_score  substitution_score
0  P0001  P0002         0.073209           0.397706            0.203007
1  P0001  P0003         0.064998           0.730883            0.331352
2  P0001  P0004         0.086591           0.742388            0.348910
3  P0001  P0006         0.447235           0.395690            0.426617
4  P0001  P0010         0.073405           0.764077            0.349674
