In [1]:
# Import libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

In [3]:
# Merge datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')

In [4]:
# Create customer profiles
customer_profiles = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Category': lambda x: ' '.join(x),  # Combine categories as a string
    'Region': 'first'
}).reset_index()

In [5]:
# Encode text data (Region + Category)
customer_profiles['ProfileText'] = customer_profiles['Region'] + ' ' + customer_profiles['Category']
print(customer_profiles.head())

  CustomerID  TotalValue  Quantity  \
0      C0001     3354.52        12   
1      C0002     1862.74        10   
2      C0003     2725.38        14   
3      C0004     5354.88        23   
4      C0005     2034.24         7   

                                            Category         Region  \
0  Home Decor Electronics Electronics Electronics...  South America   
1            Home Decor Home Decor Clothing Clothing           Asia   
2         Clothing Home Decor Home Decor Electronics  South America   
3  Electronics Home Decor Books Books Home Decor ...  South America   
4                 Electronics Home Decor Electronics           Asia   

                                         ProfileText  
0  South America Home Decor Electronics Electroni...  
1       Asia Home Decor Home Decor Clothing Clothing  
2  South America Clothing Home Decor Home Decor E...  
3  South America Electronics Home Decor Books Boo...  
4            Asia Electronics Home Decor Electronics  


In [6]:
# Vectorize the profiles
vectorizer = CountVectorizer()
profile_vectors = vectorizer.fit_transform(customer_profiles['ProfileText'])

In [7]:
# Compute similarity matrix
similarity_matrix = cosine_similarity(profile_vectors)

In [8]:
# Generate lookalikes
lookalike_data = {}
for idx, customer_id in enumerate(customer_profiles['CustomerID']):
    similar_indices = similarity_matrix[idx].argsort()[::-1][1:4]  # Top 3 similar customers
    lookalikes = [
        (customer_profiles['CustomerID'][i], similarity_matrix[idx][i]) 
        for i in similar_indices
    ]
    lookalike_data[customer_id] = lookalikes

In [9]:
# Save lookalike results to a CSV file
lookalike_df = pd.DataFrame([
    {'CustomerID': cust_id, 'Lookalikes': str(lookalike_data[cust_id])}
    for cust_id in lookalike_data
])
lookalike_df.to_csv('Sahil_Gupta_Lookalike.csv', index=False)

print(lookalike_df.head())
print("Lookalike model results saved to 'Sahil_Gupta_Lookalike.csv'")

  CustomerID                                         Lookalikes
0      C0001  [('C0091', 0.9331389496316869), ('C0190', 0.92...
1      C0002  [('C0134', 0.9805806756909203), ('C0159', 0.96...
2      C0003  [('C0003', 1.0000000000000002), ('C0031', 1.00...
3      C0004  [('C0113', 0.9697622757528539), ('C0047', 0.95...
4      C0005  [('C0007', 0.9999999999999998), ('C0140', 0.94...
Lookalike model results saved to 'Sahil_Gupta_Lookalike.csv'
