In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

load the data set

In [2]:
# Load the datasets
customers = pd.read_csv('/content/Customers.csv')
products= pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')

Data Preprocessing

In [22]:
# Merge datasets to create a comprehensive dataset
#df = pd.merge(transactions, customers, on='CustomerID', how='left')
#df = pd.merge(df, products, on='ProductID', how='left')
#print(df)
# Merge datasets to create a comprehensive dataset
df = pd.merge(transactions, customers[['CustomerID', 'Region']], on='CustomerID', how='left')
df = pd.merge(df, products, on='ProductID', how='left')
print(df)

    TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0          T00001      C0199      P067  2024-08-25 12:38:23         1   
1          T00112      C0146      P067  2024-05-27 22:23:54         1   
2          T00166      C0127      P067  2024-04-25 07:38:55         1   
3          T00272      C0087      P067  2024-03-26 22:55:37         2   
4          T00363      C0070      P067  2024-03-21 15:10:10         3   
..            ...        ...       ...                  ...       ...   
995        T00496      C0118      P037  2024-10-24 08:30:27         1   
996        T00759      C0059      P037  2024-06-04 02:15:24         3   
997        T00922      C0018      P037  2024-04-05 13:05:32         4   
998        T00959      C0115      P037  2024-09-29 10:16:02         2   
999        T00992      C0024      P037  2024-04-21 10:52:24         1   

     TotalValue  Price_x         Region                      ProductName  \
0        300.68   300.68         Europe  Comfor

In [23]:
# Aggregate transaction features for each customer
customer_features = df.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    avg_transaction_value=('TotalValue', 'mean'),
    transaction_count=('TransactionID', 'count'),
    product_diversity=('ProductID', 'nunique'),
    region=('Region', 'first')  # Include region as a categorical feature
).reset_index()
print(customer_features)

    CustomerID  total_spend  avg_transaction_value  transaction_count  \
0        C0001      3354.52             670.904000                  5   
1        C0002      1862.74             465.685000                  4   
2        C0003      2725.38             681.345000                  4   
3        C0004      5354.88             669.360000                  8   
4        C0005      2034.24             678.080000                  3   
..         ...          ...                    ...                ...   
194      C0196      4982.88            1245.720000                  4   
195      C0197      1928.65             642.883333                  3   
196      C0198       931.83             465.915000                  2   
197      C0199      1979.28             494.820000                  4   
198      C0200      4758.60             951.720000                  5   

     product_diversity         region  
0                    5  South America  
1                    4           Asia  
2  

In [None]:
# One-hot encode categorical features
customer_features = pd.get_dummies(customer_features, columns=['region'])

In [27]:
 #Feature Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns=['CustomerID']))


In [28]:
# Compute Cosine Similarity
similarity_matrix = cosine_similarity(scaled_features)

In [44]:
# Create a lookalike map for the first 20 customers
lookalike_map = {}
for i in range(20):  # For CustomerIDs C0001 - C0020
    customer_id = customer_features.iloc[i]['CustomerID']
    similarity_scores = list(enumerate(similarity_matrix[i]))
    # Sort by similarity score in descending order (excluding the customer itself)
    similar_customers = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]
    # Get top 3 similar customers with their similarity scores
    lookalike_map[customer_id] = [(customer_features.iloc[j[0]]['CustomerID'], j[1]) for j in similar_customers]


In [40]:
# Convert the lookalike_map to a DataFrame
lookalike_data = []

# Populate the list with structured data
for customer_id, lookalikes in lookalike_map.items():
    for similar_customer, score in lookalikes:
        lookalike_data.append({
            'CustomerID': customer_id,
            'SimilarCustomerID': similar_customer,
            'SimilarityScore': score
        })


In [41]:
# Create a DataFrame from the structured data
lookalike_df = pd.DataFrame(lookalike_data)


In [42]:
# Save the DataFrame to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

In [43]:
# Print a success message
print("Lookalike.csv has been successfully saved!")

Lookalike.csv has been successfully saved!
