In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Data Loading

In [2]:
# Folder holding the three input CSVs. Kept in one place so the notebook can
# be pointed at another copy of the data by editing a single line.
DATA_DIR = "/content/drive/MyDrive/Data Science"

# Raw input tables: one row per customer, product and transaction respectively.
customers = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

In [4]:
# Convert the date columns from their CSV text form to datetime64 so that
# date arithmetic (used later for recency) works.
for frame, date_column in (
    (customers, 'SignupDate'),
    (transactions, 'TransactionDate'),
):
    frame[date_column] = pd.to_datetime(frame[date_column])

In [5]:
# Attach customer and product attributes to every transaction.
#
# validate='many_to_one' makes pandas raise a MergeError if CustomerID or
# ProductID is duplicated in its lookup table; without it such duplicates
# would silently multiply transaction rows and inflate the per-customer sums
# computed from this frame.
#
# Both `transactions` and `products` carry a `Price` column, so the result
# exposes them as `Price_x` (from transactions) and `Price_y` (from products).
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner', validate='many_to_one')
    .merge(products, on='ProductID', how='inner', validate='many_to_one')
)

In [8]:
# Preview the first five joined rows as a quick sanity check on the merge.
merged_data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


# Lookalike Model

In [32]:
# Collapse the transaction-level data to one row per customer:
#   TotalValue - total amount spent
#   Quantity   - total units bought
#   Price_x    - mean transaction price
#   ProductID  - space-separated string of every product ID purchased
transactions_by_customer = merged_data.groupby('CustomerID')
customer_features = transactions_by_customer.agg(
    TotalValue=('TotalValue', 'sum'),
    Quantity=('Quantity', 'sum'),
    Price_x=('Price_x', 'mean'),
    ProductID=('ProductID', lambda product_ids: ' '.join(str(pid) for pid in product_ids)),
).reset_index()

In [33]:
# Timestamp of each customer's most recent purchase.
latest_transaction = merged_data.groupby('CustomerID')['TransactionDate'].max()

# Recency = whole days between a customer's last purchase and the newest
# purchase in the data set. The helper timestamp column is dropped once the
# day count has been derived from it.
with_last_purchase = customer_features.merge(latest_transaction, on='CustomerID')
newest_purchase = with_last_purchase['TransactionDate'].max()
days_since_last_purchase = (newest_purchase - with_last_purchase['TransactionDate']).dt.days
customer_features = (
    with_last_purchase
    .assign(Recency=days_since_last_purchase)
    .drop(columns='TransactionDate')
)

In [35]:
# Build one space-separated text "profile" per customer:
#   "<TotalValue> <Quantity> <Price_x> <Recency> <product ids...>"
# NOTE(review): the numeric fields are stringified here, so the TF-IDF step
# that consumes this column sees them as text tokens rather than magnitudes.
profile_text = customer_features['TotalValue'].astype(str)
for numeric_column in ('Quantity', 'Price_x', 'Recency'):
    profile_text = profile_text + ' ' + customer_features[numeric_column].astype(str)
customer_features['Profile'] = profile_text + ' ' + customer_features['ProductID']

# Similarity Calculation

In [36]:
# Turn each customer's Profile string into a sparse TF-IDF vector.
# (TfidfVectorizer is imported with the other dependencies in the first cell.)
#   max_features=100    keep only the 100 most frequent terms in the corpus
#   ngram_range=(1, 2)  use single tokens and adjacent token pairs
#   min_df=2            ignore terms that occur in fewer than two profiles
vectorizer = TfidfVectorizer(max_features=100, ngram_range=(1, 2), min_df=2)
profile_matrix = vectorizer.fit_transform(customer_features['Profile'])

In [37]:
# Pairwise cosine similarity between all customer profile vectors, wrapped in
# a DataFrame labelled by CustomerID on both axes so scores can be looked up
# by customer rather than by position.
customer_ids = customer_features['CustomerID']
similarity_matrix = cosine_similarity(profile_matrix)
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

In [38]:
# How many customers (taken from the top of customer_features) to score, and
# how many lookalikes to keep for each of them.
TARGET_CUSTOMER_COUNT = 20
LOOKALIKES_PER_CUSTOMER = 3

# customer_id -> [(lookalike_id, rounded_score), ...] ordered best match first.
lookalike_results = {}
# Unrounded scores of every selected lookalike, in selection order.
true_scores = []

for customer_id in customer_features['CustomerID'][:TARGET_CUSTOMER_COUNT]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: other customers can tie with (or, for an all-zero profile vector,
    # beat) the self-similarity score, in which case slicing off position 0
    # would discard a real neighbour and could list the customer as their own
    # lookalike.
    similar_customers = (
        similarity_df[customer_id]
        .drop(labels=customer_id)
        .nlargest(LOOKALIKES_PER_CUSTOMER)
    )
    lookalike_results[customer_id] = [
        (cust, round(score, 2)) for cust, score in similar_customers.items()
    ]
    true_scores.extend(similar_customers.values)

In [39]:
# Flatten the per-customer lookalike lists into one wide row per customer
# (ID, then lookalike/score pairs for ranks 1-3) and write the table to CSV.
matches_per_customer = list(lookalike_results.values())

lookalike_columns = {'CustomerID': list(lookalike_results.keys())}
rank_column_names = (
    ('Lookalike1', 'Score1'),
    ('Lookalike2', 'Score2'),
    ('Lookalike3', 'Score3'),
)
for rank, (id_column, score_column) in enumerate(rank_column_names):
    lookalike_columns[id_column] = [matches[rank][0] for matches in matches_per_customer]
    lookalike_columns[score_column] = [matches[rank][1] for matches in matches_per_customer]

lookalike_df = pd.DataFrame(lookalike_columns)
lookalike_df.to_csv("Soundarya_Rathod_Lookalike.csv", index=False)

In [28]:
# Colab-only helper: hand the generated CSV to the browser as a download.
from google.colab import files

lookalike_csv_name = 'Soundarya_Rathod_Lookalike.csv'
files.download(lookalike_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>