**Lookalike Model**

Importing Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

Load data

In [17]:
# Read the three source tables from CSV files in the working directory.
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Show the first rows of each table under its own heading.
previews = (
    ("Customers Data:\n", customers),
    ("\nProducts Data:\n", products),
    ("\nTransactions Data:\n", transactions),
)
for heading, table in previews:
    print(heading, table.head())

Customers Data:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Data:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Data:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C01

In [3]:

# print(type(transactions))  
# print(type(customers)) 

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


Data Cleaning

In [14]:
# print("Missing values in Customers Data:\n", customers.isnull().sum())
# print("Missing values in Products Data:\n", products.isnull().sum())
# print("Missing values in Transactions Data:\n", transactions.isnull().sum())

# # Converting to datetime format
# customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
# transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# # print(customers.dtypes)
# # print(transactions.dtypes)

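# # Remove duplicates if any
# # NOTE: drop_duplicates(inplace=True) returns None, so assigning its result
# # would replace each DataFrame with None; assign the returned copy instead.
# customers = customers.drop_duplicates()
# products = products.drop_duplicates()
# transactions = transactions.drop_duplicates()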


Missing values in Customers Data:
 CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
Missing values in Products Data:
 ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
Missing values in Transactions Data:
 TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


Merging Data

In [18]:
# Attach customer attributes, then product attributes, to every transaction.
# Both joins use the default inner join on the shared key column.
merged_df = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

In [19]:
# total spend for each customer
# Per-customer purchase profile: total value spent, number of transactions,
# and number of distinct products bought.
spend_aggregations = {
    'total_spent': ('TotalValue', 'sum'),
    'num_transactions': ('TransactionID', 'count'),
    'num_products': ('ProductID', 'nunique'),
}
customer_spend = (
    merged_df
    .groupby('CustomerID')
    .agg(**spend_aggregations)
    .reset_index()
)

# Inner join: only customers that have an aggregated profile are kept.
customers_features = customers.merge(customer_spend, on='CustomerID')

# Standardise the numeric profile columns before comparing customers.
feature_columns = ['total_spent', 'num_transactions', 'num_products']
features = customers_features[feature_columns]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Pairwise cosine similarity between all customers' scaled profiles.
cos_sim = cosine_similarity(scaled_features)

# function top 3 lookalikes for each customer
def get_top_3_lookalikes(cos_sim, customers_features, num_customers=20, top_n=3):
    """Return the most similar customers for the leading rows of a similarity matrix.

    Args:
        cos_sim: Square similarity matrix (a sequence of rows); entry [i][j] is
            the similarity between the customers in rows i and j.
        customers_features: Table whose 'CustomerID' column lists customer IDs
            in the same row order as ``cos_sim``.
        num_customers: How many leading rows of ``cos_sim`` to process.
            Defaults to 20 (the previously hard-coded limit); pass None to
            process every row.
        top_n: How many lookalikes to keep per customer. Defaults to 3.

    Returns:
        dict mapping each processed customer's ID to a list of at most
        ``top_n`` dicts ``{'CustomerID': ..., 'Score': ...}``, ordered by
        descending score. Scores are plain Python floats.
    """
    # Materialise the IDs positionally: row i of cos_sim belongs to
    # customer_ids[i] regardless of the table's index labels. Indexing the
    # column with [i] would be a label lookup and break on a non-default index.
    customer_ids = list(customers_features['CustomerID'])

    lookalikes = {}
    for i, row in enumerate(cos_sim[:num_customers]):
        # Exclude the customer's own entry, then rank the rest by similarity.
        # list.sort is stable, so ties keep their original row order.
        candidates = [(j, score) for j, score in enumerate(row) if j != i]
        candidates.sort(key=lambda pair: pair[1], reverse=True)

        lookalikes[customer_ids[i]] = [
            # float() keeps str()/repr() of the result free of NumPy scalar
            # wrappers such as "np.float64(...)".
            {'CustomerID': customer_ids[j], 'Score': float(score)}
            for j, score in candidates[:top_n]
        ]
    return lookalikes

# top 3
# Compute the lookalike mapping (customer ID -> list of similar customers).
lookalike_results = get_top_3_lookalikes(cos_sim, customers_features)

# One row per customer; the lookalike list is stored as its string form.
lookalike_rows = [
    {'CustomerID': customer_id, 'Lookalikes': str(matches)}
    for customer_id, matches in lookalike_results.items()
]
lookalike_df = pd.DataFrame(lookalike_rows)

# Write the table to disk without the index column.
lookalike_df.to_csv('Lookalike.csv', index=False)

lookalike_df.head()

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[{'CustomerID': 'C0137', 'Score': 0.9963323425..."
1,C0002,"[{'CustomerID': 'C0029', 'Score': 0.9997577919..."
2,C0003,"[{'CustomerID': 'C0178', 'Score': 0.9999491872..."
3,C0004,"[{'CustomerID': 'C0021', 'Score': 0.9999029227..."
4,C0005,"[{'CustomerID': 'C0073', 'Score': 0.9999664651..."
