### **Import Libraries**

In [21]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

### **Loading Datasets**

In [2]:
# Read the three raw input tables from CSV files in the working directory.
customers, transactions, products = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Transactions.csv", "Products.csv")
)

In [3]:
# Enrich every transaction with its customer's profile and its product's details.
# Both joins are pandas' default inner merge on the shared key column; the
# 'Price' column present on both sides ends up as 'Price_x' / 'Price_y'.
data = pd.merge(
    pd.merge(transactions, customers, on="CustomerID"),
    products,
    on="ProductID",
)

In [4]:
# Preview the first five rows of the merged transaction table.
data.head(n=5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


### **Feature Engineering**

In [24]:
# Per-customer behavioural features derived from the merged transaction table:
#   AvgTransactionValue   - mean TotalValue per transaction
#   TransactionCount      - number of transactions
#   AvgQuantity           - mean Quantity per transaction
#   MostPurchasedCategory - most frequent Category (first value returned by
#                           Series.mode() when several categories tie)
agg_features = data.groupby("CustomerID").agg(
    AvgTransactionValue=("TotalValue", "mean"),
    TransactionCount=("TransactionID", "count"),
    AvgQuantity=("Quantity", "mean"),
    MostPurchasedCategory=("Category", lambda categories: categories.mode()[0]),
)

In [25]:
# Preview the first five rows of the per-customer aggregates.
agg_features.head(n=5)

Unnamed: 0_level_0,AvgTransactionValue,TransactionCount,AvgQuantity,MostPurchasedCategory
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0001,670.904,5,2.4,Electronics
C0002,465.685,4,2.5,Clothing
C0003,681.345,4,3.5,Home Decor
C0004,669.36,8,2.875,Books
C0005,678.08,3,2.333333,Electronics


#### **Merging Profile Data of Each Customer**

In [41]:
# Index the customer profiles by CustomerID so they align with agg_features.
profile_features = customers.set_index("CustomerID")

# Left join (DataFrame.join's default): every customer is kept, and customers
# without any transactions get NaN in the aggregated columns.
final_features = profile_features.join(agg_features, how="left")

In [42]:
# Preview the first five rows of the combined profile + behaviour table.
final_features.head(n=5)

Unnamed: 0_level_0,CustomerName,Region,SignupDate,AvgTransactionValue,TransactionCount,AvgQuantity,MostPurchasedCategory
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C0001,Lawrence Carroll,South America,2022-07-10,670.904,5.0,2.4,Electronics
C0002,Elizabeth Lutz,Asia,2022-02-13,465.685,4.0,2.5,Clothing
C0003,Michael Rivera,South America,2024-03-07,681.345,4.0,3.5,Home Decor
C0004,Kathleen Rodriguez,South America,2022-10-09,669.36,8.0,2.875,Books
C0005,Laura Weber,Asia,2022-08-15,678.08,3.0,2.333333,Electronics


#### **Encoding and Scaling**

In [43]:
# Today's date at midnight, used as the reference point for account age.
today_date = pd.to_datetime(datetime.today().strftime('%Y-%m-%d'))

# Replace 'SignupDate' with 'RelationshipAge' (whole days between signup and today).
# The guard keeps this cell re-runnable: after the first run 'SignupDate' has
# already been removed from final_features, and recomputing the age from it
# would raise a KeyError.
if "SignupDate" in final_features.columns:
    signup_dates = pd.to_datetime(final_features["SignupDate"])
    final_features = final_features.assign(
        RelationshipAge=(today_date - signup_dates).dt.days
    ).drop(columns=["SignupDate"])

# One-hot encode the categorical columns; drop_first drops one level per
# column so the remaining dummies are not perfectly collinear.
# NOTE(review): get_dummies encodes a NaN category as all-zero dummies
# (dummy_na defaults to False), so a customer with no transactions looks the
# same as one in the dropped baseline category - confirm this is acceptable.
final_features_encoded = pd.get_dummies(
    final_features,
    columns=["Region", "MostPurchasedCategory"],
    drop_first=True,
)

# Drop irrelevant column CustomerName
final_features_encoded = final_features_encoded.drop(columns=["CustomerName"], errors='ignore')

In [44]:
# Count missing values per feature column before imputing.
final_features_encoded.isna().sum()

AvgTransactionValue                  1
TransactionCount                     1
AvgQuantity                          1
RelationshipAge                      0
Region_Europe                        0
Region_North America                 0
Region_South America                 0
MostPurchasedCategory_Clothing       0
MostPurchasedCategory_Electronics    0
MostPurchasedCategory_Home Decor     0
dtype: int64

In [45]:
# Fill missing numeric values with column means
final_features_encoded = final_features_encoded.fillna(final_features_encoded.mean())

In [46]:
# Scaling numeric columns
scaler = StandardScaler()
scaled_features = scaler.fit_transform(final_features_encoded)
scaled_features_df = pd.DataFrame(scaled_features, columns=final_features_encoded.columns)

#### **Calculate Cosine Similarity Matrix**

In [47]:
# Pairwise cosine similarity between the scaled feature rows of all customers.
similarity_matrix = cosine_similarity(X=scaled_features)

# Label both axes with the customer index so scores can be looked up by ID.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=final_features.index,
    columns=final_features.index,
)

#### **Identifying Top-3 Similar Customers for First 20 Customers**

In [50]:
# Map each customer ID to its three most similar *other* customers as a list
# of (customer_id, similarity_score) tuples, best match first.
# NOTE(review): the section heading mentions the first 20 customers, but this
# loop covers every customer in similarity_df - confirm the intended scope.
lookalike_results = {}
for customer_id in similarity_df.index:
    # Remove the customer's own entry by label rather than assuming it is
    # ranked first: with tied scores (identical feature vectors or float
    # rounding) nlargest could place another customer ahead of it, which
    # would drop a real lookalike and keep the customer itself.
    other_scores = similarity_df[customer_id].drop(labels=customer_id)
    top_matches = other_scores.nlargest(3)
    # Store plain Python floats so that writing this list to CSV yields
    # '0.93...' rather than 'np.float64(0.93...)' under NumPy >= 2.
    lookalike_results[customer_id] = [
        (other_id, float(score)) for other_id, score in top_matches.items()
    ]


#### **Saving Lookalike results to a CSV File**

In [52]:
# One row per customer: the customer's ID and its list of
# (lookalike_id, score) tuples, written to Lookalike.csv without the index.
lookalike_columns = {
    "CustomerID": list(lookalike_results.keys()),
    "Lookalikes": list(lookalike_results.values()),
}
lookalike_df = pd.DataFrame(lookalike_columns)
lookalike_df.to_csv("Lookalike.csv", index=False)