In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
cust_df = pd.read_csv("Customers.csv")
prod_df = pd.read_csv("Products.csv")
trans_df = pd.read_csv("Transactions.csv")

In [6]:
cust_df.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [7]:
prod_df.head()

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [8]:
trans_df.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [3]:
transactions_products = pd.merge(trans_df, prod_df, on='ProductID', how='left')

In [5]:
transactions_products.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [10]:
customer_transactions = pd.merge(transactions_products, cust_df, on='CustomerID', how='left')

In [11]:
customer_transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [20]:
customer_features = customer_transactions.groupby('CustomerID').agg(
        total_spent=('TotalValue', 'sum'),
        avg_spent=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        num_transactions=('TransactionID', 'count'),
        avg_quantity_per_transaction=('Quantity', 'mean'),
        unique_products=('ProductID', 'nunique'),
        fav_category=('Category', lambda x: x.mode()[0] if not x.mode().empty else None),
        region=('Region', 'first'),
        purchase_frequency=('TransactionDate', lambda x: (pd.to_datetime(x.max()) - pd.to_datetime(x.min())).days)
    ).reset_index()

In [21]:
customer_features.head()

Unnamed: 0,CustomerID,total_spent,avg_spent,total_quantity,num_transactions,avg_quantity_per_transaction,unique_products,fav_category,region,purchase_frequency
0,C0001,3354.52,670.904,12,5,2.4,5,Electronics,South America,288
1,C0002,1862.74,465.685,10,4,2.5,4,Clothing,Asia,278
2,C0003,2725.38,681.345,14,4,3.5,4,Home Decor,South America,188
3,C0004,5354.88,669.36,23,8,2.875,8,Books,South America,299
4,C0005,2034.24,678.08,7,3,2.333333,3,Electronics,Asia,233


In [22]:
encoder = OneHotEncoder()
categorical_features = encoder.fit_transform(customer_features[['fav_category', 'region']]).toarray()
encoded_feature_names = encoder.get_feature_names_out(['fav_category', 'region'])
categorical_df = pd.DataFrame(categorical_features, columns=encoded_feature_names)

In [23]:
product_encoder = OneHotEncoder()
product_features = product_encoder.fit_transform(customer_transactions[['ProductID']]).toarray()
product_encoded_names = product_encoder.get_feature_names_out(['ProductID'])
product_df = pd.DataFrame(product_features, columns=product_encoded_names)

In [25]:
features = ["fav_category", "total_quantity", "total_spent", "avg_spent", "region"]

In [38]:
final_features = pd.concat([
    customer_features[['CustomerID', 'total_spent', 'total_quantity']].reset_index(drop=True),
    categorical_df,
    product_df.groupby(customer_transactions['CustomerID']).sum().reset_index(drop=True)
], axis=1)

In [39]:
final_features.head()

Unnamed: 0,CustomerID,total_spent,total_quantity,fav_category_Books,fav_category_Clothing,fav_category_Electronics,fav_category_Home Decor,region_Asia,region_Europe,region_North America,...,ProductID_P091,ProductID_P092,ProductID_P093,ProductID_P094,ProductID_P095,ProductID_P096,ProductID_P097,ProductID_P098,ProductID_P099,ProductID_P100
0,C0001,3354.52,12,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,C0002,1862.74,10,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,C0003,2725.38,14,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,C0004,5354.88,23,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,C0005,2034.24,7,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
feature_matrix = final_features.drop(columns=['CustomerID']).values
similarity_matrix = cosine_similarity(feature_matrix)

In [41]:
recommendations = {}
for idx, customer_id in enumerate(final_features['CustomerID']):
    similarities = list(enumerate(similarity_matrix[idx]))
    similarities = sorted(similarities, key=lambda x: x[1], reverse=True)
    top_3 = [(final_features.iloc[i]['CustomerID'], score) for i, score in similarities[1:4]]
    recommendations[customer_id] = top_3

In [47]:
lookalike_data = []
for cust_id, recs in recommendations.items():
    lookalike_data.append({
        'cust_id': cust_id,
        'recommendations': [{"cust_id": rec[0], "score": rec[1]} for rec in recs]
    })
lookalike_df = pd.DataFrame(lookalike_data)

In [48]:
first_20_customers = cust_df['CustomerID'][:20].tolist()

In [49]:
filtered_lookalike_df = lookalike_df[lookalike_df['cust_id'].isin(first_20_customers)]

In [50]:
filtered_lookalike_df.to_csv('Lookalike.csv', index=False)

In [51]:
filtered_lookalike_df

Unnamed: 0,cust_id,recommendations
0,C0001,"[{'cust_id': 'C0102', 'score': 0.9999996635818..."
1,C0002,"[{'cust_id': 'C0109', 'score': 0.9999991703701..."
2,C0003,"[{'cust_id': 'C0136', 'score': 0.9999994332780..."
3,C0004,"[{'cust_id': 'C0175', 'score': 0.9999997543752..."
4,C0005,"[{'cust_id': 'C0148', 'score': 0.9999994482794..."
5,C0006,"[{'cust_id': 'C0171', 'score': 0.9999998808855..."
6,C0007,"[{'cust_id': 'C0054', 'score': 0.9999996367149..."
7,C0008,"[{'cust_id': 'C0160', 'score': 0.9999995774550..."
8,C0009,"[{'cust_id': 'C0129', 'score': 0.9999972610665..."
9,C0010,"[{'cust_id': 'C0049', 'score': 0.9999984903590..."
