In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [8]:
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [12]:
customer_profile = merged_data.groupby('CustomerID').agg({
    'ProductID': 'count',
    'TotalValue': 'sum',
    'Price_y': 'mean',
    'Quantity': 'sum'
}).rename(columns={
    'ProductID': 'TotalTransactions',
    'TotalValue': 'TotalRevenue',
    'Price_y': 'AvgPrice',
    'Quantity': 'TotalQuantity'
})

In [13]:
scaler = StandardScaler()
customer_profile_normalized = scaler.fit_transform(customer_profile)

In [14]:
similarity_matrix = cosine_similarity(customer_profile_normalized)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profile.index, columns=customer_profile.index)

lookalike_map = {}
for customer_id in customer_profile.index[:20]:
    similar_customers = similarity_df.loc[customer_id].sort_values(ascending=False)[1:4]
    lookalike_map[customer_id] = [(sim_cust_id, score) for sim_cust_id, score in similar_customers.items()]

In [15]:
lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])
lookalike_df.to_csv("Lookalike.csv")

In [16]:
print("Lookalike Map for First 20 Customers:")
lookalike_df.head(20)

Lookalike Map for First 20 Customers:


Unnamed: 0,Lookalike1,Lookalike2,Lookalike3
C0001,"(C0103, 0.9957886697482402)","(C0137, 0.9811454761725985)","(C0191, 0.9528251146146083)"
C0002,"(C0029, 0.999831588403425)","(C0077, 0.9951840919025055)","(C0025, 0.988549352110799)"
C0003,"(C0010, 0.9541919448829387)","(C0176, 0.9336110676630717)","(C0190, 0.9321707249278193)"
C0004,"(C0075, 0.9972330923018664)","(C0165, 0.9927777226734699)","(C0124, 0.9849630112143049)"
C0005,"(C0130, 0.9984971380139361)","(C0020, 0.9976058697806417)","(C0128, 0.9959830379580356)"
C0006,"(C0079, 0.9875327847394931)","(C0168, 0.9720408879057841)","(C0196, 0.9658596014222852)"
C0007,"(C0125, 0.9987680196006534)","(C0085, 0.9967303450167071)","(C0089, 0.989226265604518)"
C0008,"(C0084, 0.98976466569664)","(C0179, 0.9897381731710889)","(C0090, 0.9800901486917245)"
C0009,"(C0192, 0.9987960773716095)","(C0128, 0.9876085017979392)","(C0061, 0.9733022317852845)"
C0010,"(C0142, 0.9844170187674133)","(C0121, 0.9742317763914193)","(C0055, 0.9703000423206266)"


In [17]:
merged_data['Category'] = merged_data['Category'].fillna('Unknown')  # Handle missing categories
customer_product_interests = pd.crosstab(merged_data['CustomerID'], merged_data['Category'])
scaler = StandardScaler()
customer_product_normalized = scaler.fit_transform(customer_product_interests)

# Calculating similarity based on product interests
similarity_matrix = cosine_similarity(customer_product_normalized)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_product_interests.index, columns=customer_product_interests.index)

# Finding top 3 lookalikes for the first 20 customers
lookalike_map = {}
for customer_id in customer_product_interests.index[:20]:
    similar_customers = similarity_df.loc[customer_id].sort_values(ascending=False)[1:4]  # Exclude self
    lookalike_map[customer_id] = [(sim_cust_id, score) for sim_cust_id, score in similar_customers.items()]

# Saving the lookalike map to a CSV file
lookalike_df = pd.DataFrame.from_dict(lookalike_map, orient='index', columns=['Lookalike1', 'Lookalike2', 'Lookalike3'])
lookalike_df.to_csv("Lookalike_Criteria_2.csv")

print("Lookalike Map for First 20 Customers:")
lookalike_df.head(20)

Lookalike Map for First 20 Customers:


Unnamed: 0,Lookalike1,Lookalike2,Lookalike3
C0001,"(C0069, 0.9500548112992332)","(C0035, 0.9134825860307535)","(C0146, 0.9134825860307535)"
C0002,"(C0133, 0.9999999999999999)","(C0134, 0.9411705684349593)","(C0103, 0.8941146799902356)"
C0003,"(C0166, 1.0000000000000002)","(C0158, 1.0000000000000002)","(C0031, 1.0000000000000002)"
C0004,"(C0047, 0.9328877664310049)","(C0194, 0.8994875108213158)","(C0090, 0.8994875108213158)"
C0005,"(C0007, 0.9999999999999999)","(C0197, 0.9999999999999999)","(C0120, 0.8793075683750426)"
C0006,"(C0006, 1.0000000000000002)","(C0132, 0.8556577164646645)","(C0142, 0.8512429640356879)"
C0007,"(C0007, 0.9999999999999999)","(C0197, 0.9999999999999999)","(C0120, 0.8793075683750426)"
C0008,"(C0162, 0.9269408763160711)","(C0059, 0.9089160837878303)","(C0154, 0.8823041196107578)"
C0009,"(C0040, 1.0)","(C0092, 0.9378332740160448)","(C0029, 0.880019068160241)"
C0010,"(C0176, 0.9368731512468093)","(C0083, 0.9368731512468093)","(C0077, 0.9368731512468093)"
