In [6]:
pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.1-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp313-cp313-macosx_12_0_arm64.whl (11.1 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.15.1-cp313-cp313-macosx_14_0_arm64.whl (24.8 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 scipy-1.15.1 threadpoolctl-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [8]:
customers = pd.read_csv('/Users/shreyas/Desktop/untitled folder/ecommerce_analysis/data/Customers.csv')
products = pd.read_csv('/Users/shreyas/Desktop/untitled folder/ecommerce_analysis/data/Products.csv')
transactions = pd.read_csv('/Users/shreyas/Desktop/untitled folder/ecommerce_analysis/data/Transactions.csv')

In [9]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

In [10]:
merged_data = transactions.merge(customers, on='CustomerID', how='inner').merge(products, on='ProductID', how='inner')


In [11]:
customer_profiles = merged_data.groupby('CustomerID').agg(
    total_purchases=('Quantity', 'sum'),
    total_spending=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    preferred_category=('Category', lambda x: x.mode()[0]), 
    region=('Region', 'first') 
).reset_index()

In [12]:
customer_profiles = pd.get_dummies(customer_profiles, columns=['preferred_category', 'region'], drop_first=True)

In [13]:
scaler = StandardScaler()
numerical_features = ['total_purchases', 'total_spending', 'avg_transaction_value']
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

In [15]:
customer_ids = customer_profiles['CustomerID']
feature_matrix = customer_profiles.drop(columns=['CustomerID']).values
similarity_matrix = cosine_similarity(feature_matrix)

In [16]:
lookalike_results = []

In [17]:
for i, customer_id in enumerate(customer_ids[:20]):  # Limit to first 20 customers
    # Get similarity scores for the current customer
    similarities = similarity_matrix[i]
    # Exclude the current customer and sort by similarity score
    similar_customers = sorted(
        [(customer_ids[j], score) for j, score in enumerate(similarities) if j != i],
        key=lambda x: x[1],
        reverse=True
    )
    # Select the top 3 similar customers
    top_3 = similar_customers[:3]
    lookalike_results.append({
        'CustomerID': customer_id,
        'Lookalike1': top_3[0][0], 'Score1': round(top_3[0][1], 2),
        'Lookalike2': top_3[1][0], 'Score2': round(top_3[1][1], 2),
        'Lookalike3': top_3[2][0], 'Score3': round(top_3[2][1], 2)
    })

In [18]:
lookalike_df = pd.DataFrame(lookalike_results)

In [21]:
lookalike_df.to_csv('/Users/shreyas/Desktop/untitled folder/ecommerce_analysis/outputs/Lookalike.csv', index=False)

In [20]:
print(lookalike_df)

   CustomerID Lookalike1  Score1 Lookalike2  Score2 Lookalike3  Score3
0       C0001      C0190    0.94      C0048    0.94      C0181    0.90
1       C0002      C0088    0.99      C0077    0.90      C0083    0.89
2       C0003      C0052    0.90      C0152    0.87      C0195    0.84
3       C0004      C0165    0.98      C0169    0.96      C0175    0.89
4       C0005      C0146    0.98      C0186    0.97      C0130    0.89
5       C0006      C0168    0.98      C0187    0.95      C0171    0.94
6       C0007      C0140    0.97      C0115    0.92      C0020    0.86
7       C0008      C0160    0.78      C0024    0.75      C0194    0.75
8       C0009      C0198    0.98      C0103    0.94      C0062    0.93
9       C0010      C0111    0.95      C0062    0.92      C0103    0.89
10      C0011      C0126    0.93      C0137    0.88      C0187    0.81
11      C0012      C0104    0.97      C0113    0.93      C0195    0.93
12      C0013      C0099    0.98      C0108    0.92      C0143    0.88
13    