In [1]:
import pandas as pd

LOADING DATASETS

In [2]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

CHECKING FOR DATA BY DISPLAYING FEW ROWS

In [3]:
print("Customers Data")
print(customers.head())

Customers Data
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15


In [4]:
print("Products Data")
print(products.head())

Products Data
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31


In [5]:
print("Transactions Data")
print(transactions.head())

Transactions Data
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue   Price  
0      300.68  300.68  
1      300.68  300.68  
2      300.68  300.68  
3      601.36  300.68  
4      902.04  300.68  


MERGING ALL DATASETS

In [6]:
transactions_products = transactions.merge(products, on='ProductID')
customer_transactions_products = transactions_products.merge(customers, on='CustomerID')
print(customer_transactions_products.head())
print(customer_transactions_products.shape)

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

In [7]:
print(customer_transactions_products.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   ProductName      1000 non-null   object 
 8   Category         1000 non-null   object 
 9   Price_y          1000 non-null   float64
 10  CustomerName     1000 non-null   object 
 11  Region           1000 non-null   object 
 12  SignupDate       1000 non-null   object 
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB
None


FEATURE ENGINEERING 

EXTRACTING FEATURES LIKE Spending habits, Purchase behavior, Activity level and Profile information.

In [8]:
customer_features = customer_transactions_products.groupby('CustomerID').agg(
    total_spent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
    avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
    transaction_count=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    distinct_products=pd.NamedAgg(column='ProductID', aggfunc='nunique'),
    categories_bought=pd.NamedAgg(column='Category', aggfunc='nunique')
).reset_index()

In [9]:
customer_features = customer_features.merge(customers[['CustomerID', 'Region', 'SignupDate']], on='CustomerID')

In [10]:
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])
customer_features['days_since_signup'] = (pd.Timestamp.now() - customer_features['SignupDate']).dt.days
print(customer_features.head())

  CustomerID  total_spent  avg_transaction_value  transaction_count  \
0      C0001      3354.52                670.904                  5   
1      C0002      1862.74                465.685                  4   
2      C0003      2725.38                681.345                  4   
3      C0004      5354.88                669.360                  8   
4      C0005      2034.24                678.080                  3   

   distinct_products  categories_bought         Region SignupDate  \
0                  5                  3  South America 2022-07-10   
1                  4                  2           Asia 2022-02-13   
2                  4                  3  South America 2024-03-07   
3                  8                  3  South America 2022-10-09   
4                  3                  2           Asia 2022-08-15   

   days_since_signup  
0                931  
1               1078  
2                325  
3                840  
4                895  


In [11]:
customer_features.drop(columns=['SignupDate'], inplace=True)
print(customer_features.head())

  CustomerID  total_spent  avg_transaction_value  transaction_count  \
0      C0001      3354.52                670.904                  5   
1      C0002      1862.74                465.685                  4   
2      C0003      2725.38                681.345                  4   
3      C0004      5354.88                669.360                  8   
4      C0005      2034.24                678.080                  3   

   distinct_products  categories_bought         Region  days_since_signup  
0                  5                  3  South America                931  
1                  4                  2           Asia               1078  
2                  4                  3  South America                325  
3                  8                  3  South America                840  
4                  3                  2           Asia                895  


NORMALIZING FEATURES

In [12]:
!pip install scikit-learn




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
from sklearn.preprocessing import MinMaxScaler
numerical_cols = ['total_spent', 'avg_transaction_value', 'transaction_count', 'distinct_products', 'categories_bought', 'days_since_signup']
scaler = MinMaxScaler()
customer_features[numerical_cols] = scaler.fit_transform(customer_features[numerical_cols])
print(customer_features.head())

  CustomerID  total_spent  avg_transaction_value  transaction_count  \
0      C0001     0.308942               0.474336                0.4   
1      C0002     0.168095               0.308940                0.3   
2      C0003     0.249541               0.482751                0.3   
3      C0004     0.497806               0.473092                0.7   
4      C0005     0.184287               0.480120                0.2   

   distinct_products  categories_bought         Region  days_since_signup  
0           0.444444           0.666667  South America           0.842204  
1           0.333333           0.333333           Asia           0.979458  
2           0.333333           0.666667  South America           0.276377  
3           0.777778           0.666667  South America           0.757236  
4           0.222222           0.333333           Asia           0.808590  


CALCULATING SIMILARITY SCORES

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
feature_matrix = customer_features[numerical_cols].values
similarity_matrix = cosine_similarity(feature_matrix)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])
print(similarity_df.head())

CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.946882  0.916777  0.958777  0.954676  0.888789   
C0002       0.946882  1.000000  0.742949  0.875756  0.973644  0.724595   
C0003       0.916777  0.742949  1.000000  0.910060  0.803829  0.974541   
C0004       0.958777  0.875756  0.910060  1.000000  0.862280  0.872022   
C0005       0.954676  0.973644  0.803829  0.862280  1.000000  0.822160   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.937552  0.832701  0.986637  0.979774  ...  0.896276  0.991047   
C0002       0.949249  0.672084  0.966146  0.987797  ...  0.763949  0.945439   
C0003       0.803144  0.889938  0.864651  0.825901  ...  0.930660  0.899014   
C0004       0.844387  0.928800  0.930558  0.937100  ...  0.964941  0.923287   
C0005  

TOP 3 RECOMMENDATION GENERATION

In [15]:
def get_top_n_similar(similarity_df, n=3):
    recommendations = {}
    for cust_id in similarity_df.index:
        similar_customers = similarity_df.loc[cust_id].drop(cust_id).sort_values(ascending=False).head(n)
        recommendations[cust_id] = list(zip(similar_customers.index, similar_customers.values))
    return recommendations

FOR FIRST 20 CUSTOMERS (CustomerID: C0001 - C0020)

In [16]:
top_n_recommendations = get_top_n_similar(similarity_df.loc['C0001':'C0020'], n=3)
print(top_n_recommendations)

{'C0001': [('C0152', np.float64(0.9991413126016656)), ('C0106', np.float64(0.998440884683327)), ('C0174', np.float64(0.9972952693891033))], 'C0002': [('C0071', np.float64(0.9941366378317117)), ('C0134', np.float64(0.9920668372260568)), ('C0038', np.float64(0.9886562124462539))], 'C0003': [('C0170', np.float64(0.9971037326344968)), ('C0177', np.float64(0.9965184620552714)), ('C0178', np.float64(0.9954984703465475))], 'C0004': [('C0108', np.float64(0.9967662274809577)), ('C0175', np.float64(0.9932902636506894)), ('C0155', np.float64(0.9927794835384522))], 'C0005': [('C0159', np.float64(0.9999773928810057)), ('C0073', np.float64(0.9999652022442802)), ('C0112', np.float64(0.9993636535401836))], 'C0006': [('C0026', np.float64(0.9934115643934283)), ('C0185', np.float64(0.9873659889975546)), ('C0148', np.float64(0.9834797940131053))], 'C0007': [('C0176', np.float64(0.9961186063771026)), ('C0073', np.float64(0.9958664745081781)), ('C0159', np.float64(0.995769958178055))], 'C0008': [('C0047', n

SAVING OUTPUT TO CSV

In [20]:
output = []
for cust_id, lookalikes in top_n_recommendations.items():
    lookalike_list = [(str(other_cust_id), float(f"{score:.4f}")) for other_cust_id, score in lookalikes]
    output.append({'cust_id': cust_id, 'lookalikes': lookalike_list})
lookalike_df = pd.DataFrame(output)
lookalike_df.to_csv('Renu_Shree_Lookalike.csv', index=False)
print("Renu_Shree_Lookalike.csv saved successfully!")

Renu_Shree_Lookalike.csv saved successfully!
