In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [23]:
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
customers = pd.read_csv("Customers.csv")
transaction = pd.read_csv("Transactions.csv")

In [4]:
df = pd.merge(customers,transaction,on='CustomerID')

In [5]:
df.shape

(1000, 10)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerID       1000 non-null   object 
 1   CustomerName     1000 non-null   object 
 2   Region           1000 non-null   object 
 3   SignupDate       1000 non-null   object 
 4   TransactionID    1000 non-null   object 
 5   ProductID        1000 non-null   object 
 6   TransactionDate  1000 non-null   object 
 7   Quantity         1000 non-null   int64  
 8   TotalValue       1000 non-null   float64
 9   Price            1000 non-null   float64
dtypes: float64(2), int64(1), object(7)
memory usage: 78.3+ KB


In [8]:
df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2,114.6,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3,412.62,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2,614.94,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2,911.44,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3,1300.92,433.64


In [9]:
df['SignupDate'] = pd.to_datetime(df['SignupDate'])
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'])
df['AccountAge'] = (df['TransactionDate']-df['SignupDate']).dt.days

In [13]:
customer_features = df.groupby('CustomerID').agg({
    'Region':'first', 
    'AccountAge':'max', 
    'Quantity':['sum','mean'], 
    'TotalValue': ['sum', 'mean'],  
    'Price': ['mean'], 
    'ProductID': 'nunique'
}).reset_index()

In [14]:
customer_features.columns = ['CustomerID', 'Region', 'AccountAge', 'TotalQuantity', 'AvgQuantity',
                               'TotalSpend', 'AvgSpend','AvgPrice', 'UniqueProducts']

In [19]:
label_encoder = LabelEncoder()
customer_features['Region'] = label_encoder.fit_transform(customer_features['Region'])

In [21]:
cols_to_scale = ['AccountAge', 'TotalQuantity', 'AvgQuantity','TotalSpend', 'AvgSpend','AvgPrice']
scaler = StandardScaler()
customer_features[cols_to_scale] = scaler.fit_transform(customer_features[cols_to_scale])

In [24]:
feature_matrix = customer_features.drop('CustomerID',axis=1)
similarity_matrix = cosine_similarity(feature_matrix)

In [26]:
def get_top_lookalikes(customer_idx, n=3):
        customer_similarities = similarity_matrix[customer_idx]

        similar_indices = np.argsort(customer_similarities)[::-1][1:n+1]
        similar_scores = customer_similarities[similar_indices]
        similar_ids = customer_features.iloc[similar_indices]['CustomerID'].values
        return list(zip(similar_ids, similar_scores))

In [28]:
lookalike_recommendations = {}
for i in range(20):
    customer_id = f'C{i+1:04d}'
    customer_idx = customer_features[customer_features['CustomerID'] == customer_id].index[0]
    lookalikes = get_top_lookalikes(customer_idx)
    lookalike_recommendations[customer_id] = lookalikes

In [32]:
output_rows = []
for cust_id, recommendations in lookalike_recommendations.items():
    row = {
        'CustomerID': cust_id,
        'Lookalikes': ', '.join([f"{rec[0]}({rec[1]:.3f})" for rec in recommendations])
    }
    output_rows.append(row)

output_df = pd.DataFrame(output_rows)

In [33]:
output_df

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"C0011(0.994), C0152(0.991), C0107(0.991)"
1,C0002,"C0159(0.963), C0166(0.953), C0106(0.953)"
2,C0003,"C0190(0.960), C0091(0.954), C0039(0.949)"
3,C0004,"C0113(0.996), C0068(0.990), C0102(0.989)"
4,C0005,"C0159(0.964), C0061(0.935), C0007(0.929)"
5,C0006,"C0026(0.973), C0171(0.965), C0148(0.957)"
6,C0007,"C0074(0.948), C0005(0.929), C0135(0.923)"
7,C0008,"C0081(0.992), C0194(0.989), C0017(0.987)"
8,C0009,"C0128(0.961), C0119(0.926), C0198(0.886)"
9,C0010,"C0197(0.954), C0134(0.941), C0038(0.940)"


In [34]:
output_df.to_csv('Sai_Sathvik_Lookalike.csv',index=False)