In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
customers_df = pd.read_csv('/content/drive/MyDrive/Customers.csv')


In [4]:
products_df = pd.read_csv('/content/drive/MyDrive/Products.csv')
transactions_df = pd.read_csv('/content/drive/MyDrive/Transactions.csv')

In [5]:
trans_prod_df = pd.merge(transactions_df, products_df, on='ProductID', how='left')


In [6]:
customer_purchase_data = trans_prod_df.groupby(['CustomerID', 'Category']).agg(
    total_spend=('TotalValue', 'sum'),
    num_purchases=('TransactionID', 'count')
).reset_index()


In [7]:
customer_profile = customer_purchase_data.pivot_table(index='CustomerID', columns='Category',
                                                      values='num_purchases', fill_value=0)

In [8]:
scaler = StandardScaler()
customer_profile_scaled = scaler.fit_transform(customer_profile)

In [9]:
similarity_matrix = cosine_similarity(customer_profile_scaled)


In [10]:
lookalike_results = {}


In [12]:
for customer_id in customers_df['CustomerID'].iloc[:20]:
    customer_index = customers_df[customers_df['CustomerID'] == customer_id].index[0]
    similarity_scores = similarity_matrix[customer_index]

    # Get indices of the top 3 most similar customers (excluding the customer itself)
    similar_customers = np.argsort(similarity_scores)[::-1][1:4]

    # Create a list of tuples containing (customer_id, similarity_score)
    similar_customers_with_scores = [(customers_df['CustomerID'].iloc[i], similarity_scores[i]) for i in similar_customers]

    # Store the result in a dictionary
    lookalike_results[customer_id] = similar_customers_with_scores

In [13]:
lookalike_df = pd.DataFrame([(cust_id, similar_cust, score)
                             for cust_id, sim_customers in lookalike_results.items()
                             for similar_cust, score in sim_customers],
                            columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])



In [15]:
lookalike_df.to_csv('Sameer_Pawar_Lookalike.csv', index=False)

lookalike_df.head(20)

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0069,0.950055
1,C0001,C0146,0.913483
2,C0001,C0035,0.913483
3,C0002,C0002,1.0
4,C0002,C0134,0.941171
5,C0002,C0103,0.894115
6,C0003,C0158,1.0
7,C0003,C0003,1.0
8,C0003,C0166,1.0
9,C0004,C0047,0.932888
