In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the three raw tables from CSV files in the current working directory
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge datasets: attach customer attributes, then product attributes, to every
# transaction row (DataFrame.merge defaults to an inner join, so transactions
# without a matching customer or product are not kept).
data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

# A non-key column present on both sides of a merge comes back suffixed with
# '_x' (left) and '_y' (right). Keep the right-hand copy as 'Price' and discard
# the left-hand one.
# NOTE(review): presumably 'Price' exists in both Transactions.csv and
# Products.csv -- confirm against the raw files.
# errors='ignore' makes the drop a no-op when no 'Price_x' column was produced,
# mirroring rename(), which already ignores a missing 'Price_y'. Reassigning
# instead of using inplace=True keeps the step a single chained expression.
data = (
    data
    .rename(columns={'Price_y': 'Price'})
    .drop(columns=['Price_x'], errors='ignore')
)

# Verify the updated columns
print(data.columns)


# Feature engineering: collapse the merged transaction rows into one row per customer
def _most_frequent(values):
    """Return the first modal value of the given Series."""
    return values.mode()[0]


customer_aggregations = {
    'TotalValue': 'sum',         # total spending
    'TransactionID': 'count',    # number of transactions
    'Price': 'mean',             # average product price
    'Region': _most_frequent,    # most frequent region
    'Category': _most_frequent,  # most frequent product category
}
customer_features = data.groupby('CustomerID').agg(customer_aggregations)
customer_features = customer_features.reset_index()

# One-hot encode the categorical columns so every feature is numeric
categorical_cols = ['Region', 'Category']
customer_features = pd.get_dummies(customer_features, columns=categorical_cols)

# Standardize the numeric aggregates; the one-hot columns are left untouched
numerical_cols = ['TotalValue', 'TransactionID', 'Price']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_cols])
customer_features[numerical_cols] = scaled_values

print(customer_features.head())


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'CustomerName', 'Region', 'SignupDate',
       'ProductName', 'Category', 'Price'],
      dtype='object')
  CustomerID  TotalValue  TransactionID     Price  Region_Asia  Region_Europe  \
0      C0001   -0.061701      -0.011458  0.094670        False          False   
1      C0002   -0.877744      -0.467494 -0.904016         True          False   
2      C0003   -0.405857      -0.467494 -1.094109        False          False   
3      C0004    1.032547       1.356650 -0.447702        False          False   
4      C0005   -0.783929      -0.923530  0.285581         True          False   

   Region_North America  Region_South America  Category_Books  \
0                 False                  True           False   
1                 False                 False           False   
2                 False                  True           False   
3                 False                  Tru

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between customers, computed on every feature
# column except the identifier
customer_ids = customer_features['CustomerID']
feature_matrix = customer_features.drop(columns='CustomerID')
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by customer
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

# Preview similarity matrix
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.009418  0.345240  0.273953  0.406312  0.422097   
C0002      -0.009418  1.000000  0.423637 -0.257615  0.506524 -0.269005   
C0003       0.345240  0.423637  1.000000  0.102089  0.122719  0.030062   
C0004       0.273953 -0.257615  0.102089  1.000000 -0.514452  0.351121   
C0005       0.406312  0.506524  0.122719 -0.514452  1.000000  0.105804   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.387376 -0.027575  0.042388 -0.027808  ...  0.502552  0.866113   
C0002       0.206259 -0.169922  0.531350  0.750730  ...  0.029068  0.213310   
C0003      -0.152739  0.076239  0.136433  0.531577  ...  0.339590  0.408245   
C0004      -0.478770  0.605671 -0.570818 -0.154720  ...  0.503359 -0.134963   
C0005  

In [9]:
# Generate lookalike recommendations for the first 20 customers.
# lookalike_results maps each customer ID to a pair of parallel lists:
# (IDs of the 3 most similar customers, their similarity scores).
lookalike_results = {}
for customer_id in customer_features['CustomerID'][:20]:
    # Exclude self-similarity by label rather than by position: skipping the
    # first sorted entry is only correct when the customer's own score sorts
    # strictly first, which fails when another customer ties at 1.0 (identical
    # feature vector) or floating-point rounding reorders the top entries.
    candidate_scores = similarity_df[customer_id].drop(labels=customer_id)
    similar_customers = candidate_scores.sort_values(ascending=False).head(3)
    lookalike_results[customer_id] = (
        similar_customers.index.tolist(),
        similar_customers.values.tolist(),
    )

# Flatten the results into one record per (customer, lookalike) pair
lookalike_list = [
    {'cust_id': cust_id, 'similar_cust_id': sim_id, 'score': score}
    for cust_id, (similar_ids, scores) in lookalike_results.items()
    for sim_id, score in zip(similar_ids, scores)
]

# Pin the column list so the frame (and the CSV header) keeps the same schema
# even when there are no records.
lookalike_df = pd.DataFrame(lookalike_list, columns=['cust_id', 'similar_cust_id', 'score'])

# Preview results
print(lookalike_df.head())

# Save recommendations to a CSV file
lookalike_df.to_csv('Siliveri_Sriharshini_Lookalike.csv', index=False)


  cust_id similar_cust_id     score
0   C0001           C0181  0.943416
1   C0001           C0192  0.866113
2   C0001           C0190  0.848125
3   C0002           C0088  0.959385
4   C0002           C0134  0.908392
