In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np


In [2]:
# Read the three raw tables from CSV files in the working directory
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach product attributes to every transaction (inner join on ProductID).
# Both inputs carry a 'Price' column, so pandas disambiguates them with its
# default suffixes: Price_x (transactions side) and Price_y (products side).
transactions_products = pd.merge(transactions, products, on='ProductID')

# Attach the customer profile to every enriched transaction (inner join on CustomerID)
data = pd.merge(transactions_products, customers, on='CustomerID')

# Show the first rows of the fully merged table
print(data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

**Step 2: Feature Engineering**

In [4]:
# Display the column labels of the merged DataFrame to check which names the
# merges produced (including any suffixed duplicates such as Price_x / Price_y)
print(data.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')


In [5]:
# Rename Price_y (the 'Price' column contributed by the products table in the
# merge) to plain 'Price' for the aggregation that follows.
# The result is reassigned instead of using inplace=True, so the cell reads as
# an ordinary transformation of `data` and can be chained if needed.
data = data.rename(columns={'Price_y': 'Price'})


In [6]:
# Build one row of spending features per customer with named aggregation:
#   TotalSpend          - sum of TotalValue
#   AvgTransactionValue - mean of TotalValue
#   TotalQuantity       - sum of Quantity
#   AvgPrice            - mean of Price
customer_features = (
    data.groupby('CustomerID')
    .agg(
        TotalSpend=('TotalValue', 'sum'),
        AvgTransactionValue=('TotalValue', 'mean'),
        TotalQuantity=('Quantity', 'sum'),
        AvgPrice=('Price', 'mean'),
    )
    .reset_index()
)

# Attach the customer profile columns (inner join on CustomerID)
customer_features = pd.merge(customer_features, customers, on='CustomerID')

# One-hot encode Region. drop_first=True omits the indicator column of the
# first category, so customers in that region have all region flags False.
# NOTE(review): for a similarity model this treats one region differently from
# the others - confirm this is intended.
customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Show the first rows of the feature table
print(customer_features.head())


  CustomerID  TotalSpend  AvgTransactionValue  TotalQuantity    AvgPrice  \
0      C0001     3354.52              670.904             12  278.334000   
1      C0002     1862.74              465.685             10  208.920000   
2      C0003     2725.38              681.345             14  195.707500   
3      C0004     5354.88              669.360             23  240.636250   
4      C0005     2034.24              678.080              7  291.603333   

         CustomerName  SignupDate  Region_Europe  Region_North America  \
0    Lawrence Carroll  2022-07-10          False                 False   
1      Elizabeth Lutz  2022-02-13          False                 False   
2      Michael Rivera  2024-03-07          False                 False   
3  Kathleen Rodriguez  2022-10-09          False                 False   
4         Laura Weber  2022-08-15          False                 False   

   Region_South America  
0                  True  
1                 False  
2                  T

In [7]:
print(customer_features.head())


  CustomerID  TotalSpend  AvgTransactionValue  TotalQuantity    AvgPrice  \
0      C0001     3354.52              670.904             12  278.334000   
1      C0002     1862.74              465.685             10  208.920000   
2      C0003     2725.38              681.345             14  195.707500   
3      C0004     5354.88              669.360             23  240.636250   
4      C0005     2034.24              678.080              7  291.603333   

         CustomerName  SignupDate  Region_Europe  Region_North America  \
0    Lawrence Carroll  2022-07-10          False                 False   
1      Elizabeth Lutz  2022-02-13          False                 False   
2      Michael Rivera  2024-03-07          False                 False   
3  Kathleen Rodriguez  2022-10-09          False                 False   
4         Laura Weber  2022-08-15          False                 False   

   Region_South America  
0                  True  
1                 False  
2                  T

**Step 3: Compute Similarity**

In [8]:
# Standardize the numeric spending features so that no single feature
# dominates the similarity computation purely because of its scale
numerical_cols = ['TotalSpend', 'AvgTransactionValue', 'TotalQuantity', 'AvgPrice']
scaler = StandardScaler()
customer_features[numerical_cols] = scaler.fit_transform(
    customer_features[numerical_cols]
)


In [9]:
# Keep only the model features: drop the identifier and profile columns
feature_matrix = customer_features.drop(
    columns=['CustomerID', 'CustomerName', 'SignupDate']
)

# Pairwise cosine similarity between every pair of customer rows
similarity_matrix = cosine_similarity(feature_matrix)

# Label both axes with CustomerID so scores can be looked up by customer ID
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)


**Step 4: Generate Lookalike Recommendations**

In [11]:
# Function to get the top-N most similar customers (3 by default)
def get_top_similar_customers(similarity_df, customer_id, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Similarity table whose row index and column labels are customer IDs.
    customer_id : hashable
        Row label of the customer to find lookalikes for.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of tuple
        ``(other_customer_id, score)`` pairs ordered by descending score.
        The queried customer is never included.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a row label of ``similarity_df``.
    """
    scores = similarity_df.loc[customer_id]
    # Remove the customer's own entry by label. Skipping "the first row after
    # sorting" is not safe: another customer can tie with (or, through float
    # rounding, exceed) the self-similarity and sort ahead of it, which would
    # return the customer as their own lookalike and drop a real match.
    scores = scores.drop(labels=customer_id, errors='ignore')
    # A stable sort keeps tied scores in their original column order, so the
    # result is deterministic.
    top_scores = scores.sort_values(ascending=False, kind='mergesort').head(top_n)
    return list(top_scores.items())

# Generate recommendations for customers C0001 - C0020.
# The targets are selected by ID rather than by row position: customers with
# no transactions never reach customer_features (it is built from inner
# merges), so "the first 20 rows" is not guaranteed to be C0001 - C0020.
target_ids = [f"C{number:04d}" for number in range(1, 21)]

lookalike_results = {}
for customer_id in target_ids:
    if customer_id not in similarity_df.index:
        # No feature row exists for this customer, so no lookalikes can be computed
        continue
    lookalike_results[customer_id] = get_top_similar_customers(similarity_df, customer_id)

# Convert results to a DataFrame: one row per target customer, each cell
# holding a (similar customer ID, similarity score) tuple
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index', columns=['Similar1', 'Similar2', 'Similar3'])

# Save results to CSV
lookalike_df.to_csv('Ramteja Reddy_Lookalike.csv', index_label='CustomerID')

print("Lookalike recommendations saved to 'Ramteja Reddy_Lookalike.csv'")


Lookalike recommendations saved to 'Ramteja Reddy_Lookalike.csv'
