In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [3]:
import pandas as pd

# Load datasets
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

print("Datasets loaded successfully.")


Datasets loaded successfully.


In [4]:
print("Customers Dataset:\n", customers.head())
print("\nProducts Dataset:\n", products.head())
print("\nTransactions Dataset:\n", transactions.head())


Customers Dataset:
   CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15

Products Dataset:
   ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31

Transactions Dataset:
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166

In [7]:
# Merge datasets
combined_data = transactions.merge(customers, on="CustomerID", how="left")
combined_data = combined_data.merge(products, on="ProductID", how="left")


# Check combined data
print(combined_data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving Bluetooth Speaker

In [8]:
# Create customer-product matrix
customer_product_matrix = combined_data.pivot_table(
    index="CustomerID",    # Rows are customers
    columns="ProductID",   # Columns are products
    values="Quantity",     # Values are the quantities purchased
    fill_value=0           # Replace NaNs with 0
)

# Check the matrix structure
print("Customer-Product Matrix:\n", customer_product_matrix.head())



Customer-Product Matrix:
 ProductID   P001  P002  P003  P004  P005  P006  P007  P008  P009  P010  ...  \
CustomerID                                                              ...   
C0001        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
C0002        0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
C0003        0.0   4.0   0.0   0.0   0.0   3.0   0.0   0.0   0.0   0.0  ...   
C0004        0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0  ...   
C0005        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

ProductID   P091  P092  P093  P094  P095  P096  P097  P098  P099  P100  
CustomerID                                                              
C0001        0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0  
C0002        0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0   0.0  
C0003        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
C0004        0.0   0.0   0.0   0.0   0.0   0.0   3.0   

In [12]:
print("Customer-Product Matrix Shape:", customer_product_matrix.shape)
print("Customer-Product Matrix:\n", customer_product_matrix.head())


Customer-Product Matrix Shape: (199, 100)
Customer-Product Matrix:
 ProductID   P001  P002  P003  P004  P005  P006  P007  P008  P009  P010  ...  \
CustomerID                                                              ...   
C0001        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
C0002        0.0   0.0   0.0   4.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
C0003        0.0   4.0   0.0   0.0   0.0   3.0   0.0   0.0   0.0   0.0  ...   
C0004        0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0  ...   
C0005        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

ProductID   P091  P092  P093  P094  P095  P096  P097  P098  P099  P100  
CustomerID                                                              
C0001        0.0   0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0  
C0002        0.0   0.0   0.0   0.0   2.0   0.0   0.0   0.0   0.0   0.0  
C0003        0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
C0004        

In [13]:

# Scale the data
scaler = StandardScaler()
scaled_matrix = scaler.fit_transform(customer_product_matrix)
print("Scaled Matrix Shape:", scaled_matrix.shape)
print("First 5 Rows of Scaled Matrix:\n", scaled_matrix[:5])


Scaled Matrix Shape: (199, 100)
First 5 Rows of Scaled Matrix:
 [[-0.19155441 -0.19831279 -0.23403059 -0.17304493 -0.20175905 -0.24614845
  -0.18947347 -0.22087051 -0.17279088 -0.22536027 -0.21116754 -0.17754277
  -0.2090605  -0.14175444 -0.16594796 -0.18972568 -0.20788694 -0.17452778
  -0.22952917 -0.22554673 -0.21837601  3.87187453 -0.18094262 -0.14746422
  -0.23276104 -0.16397427 -0.23346127 -0.24314056  3.44757677 -0.18972568
  -0.12865022 -0.2211743  -0.19875551 -0.19259753 -0.20529159 -0.22248457
  -0.17279088 -0.22758963 -0.23291368 -0.24614845 -0.21818701 -0.18787481
  -0.22068712 -0.09662523 -0.19155441 -0.18697859 -0.21488208 -0.26361028
  -0.24723437 -0.23382097 -0.21338733 -0.15333663 -0.20347407  2.19863054
  -0.2076852  -0.18495205 -0.25147372 -0.22758963 -0.27481335 -0.14919877
  -0.26107569 -0.2693437  -0.20347407 -0.20326144 -0.22532583 -0.21085105
  -0.24148109 -0.21345765 -0.22918549 -0.15700137 -0.18617746 -0.17279088
  -0.21085105 -0.23399165 -0.21934876 -0.1952870

In [14]:
# Calculate cosine similarity
similarity_matrix = cosine_similarity(scaled_matrix)
similarity_df = pd.DataFrame(
    similarity_matrix, index=customer_product_matrix.index, columns=customer_product_matrix.index
)
print("Similarity Matrix Shape:", similarity_matrix.shape)
print("First 5 Rows of Similarity Matrix:\n", similarity_matrix[:5])



Similarity Matrix Shape: (199, 199)
First 5 Rows of Similarity Matrix:
 [[ 1.00000000e+00 -4.81018799e-02 -6.07385447e-02 -7.90947486e-02
  -5.07741700e-02 -6.36003278e-02  1.54961977e-01 -7.14507466e-02
  -4.54132531e-02 -5.29948026e-02 -5.75847884e-02 -7.11710269e-02
  -7.16431273e-02 -2.76626810e-02 -3.38836786e-02  2.63538469e-02
   1.61143109e-02 -6.35279560e-02  9.86329615e-02  3.65399021e-01
   4.09574544e-02 -7.46171595e-02  1.18484048e-01 -6.57580834e-02
   5.52557227e-02  2.21118307e-01  8.06291600e-02 -6.03589646e-02
  -5.55912256e-02 -5.88252999e-02 -5.09409881e-02 -4.30427295e-02
  -2.97212194e-02 -6.74144269e-02  2.33302140e-01 -4.80641661e-02
  -6.46440183e-02  6.89438602e-02  3.34925618e-02 -3.98367431e-02
  -7.53604692e-02 -4.08007762e-02 -4.67914364e-02  2.74116808e-02
   4.86966053e-02  4.64848218e-02  7.52257907e-02 -7.24608353e-02
  -7.50229244e-02  1.90218073e-01  1.84761255e-01 -4.75221997e-02
  -6.16558131e-02 -7.64172321e-02 -6.64093664e-02 -5.43415931e-02
  -6

In [15]:
print("Similarity DataFrame Shape:", similarity_df.shape)
print("Similarity DataFrame:\n", similarity_df.head())



Similarity DataFrame Shape: (199, 199)
Similarity DataFrame:
 CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.048102 -0.060739 -0.079095 -0.050774 -0.063600   
C0002      -0.048102  1.000000 -0.035104 -0.051943 -0.022193 -0.034121   
C0003      -0.060739 -0.035104  1.000000  0.038353  0.244860 -0.046033   
C0004      -0.079095 -0.051943  0.038353  1.000000  0.076677 -0.064037   
C0005      -0.050774 -0.022193  0.244860  0.076677  1.000000 -0.033784   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.154962 -0.071451 -0.045413 -0.052995  ...  0.021034 -0.055930   
C0002      -0.021802  0.240156 -0.000480 -0.029831  ... -0.037158 -0.025582   
C0003      -0.032847  0.229240 -0.037276 -0.038099  ... -0.048855 -0.041297   
C0004      -0.050155 -0.

In [16]:
print("Similarity Matrix:\n", similarity_df.head())

Similarity Matrix:
 CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000 -0.048102 -0.060739 -0.079095 -0.050774 -0.063600   
C0002      -0.048102  1.000000 -0.035104 -0.051943 -0.022193 -0.034121   
C0003      -0.060739 -0.035104  1.000000  0.038353  0.244860 -0.046033   
C0004      -0.079095 -0.051943  0.038353  1.000000  0.076677 -0.064037   
C0005      -0.050774 -0.022193  0.244860  0.076677  1.000000 -0.033784   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.154962 -0.071451 -0.045413 -0.052995  ...  0.021034 -0.055930   
C0002      -0.021802  0.240156 -0.000480 -0.029831  ... -0.037158 -0.025582   
C0003      -0.032847  0.229240 -0.037276 -0.038099  ... -0.048855 -0.041297   
C0004      -0.050155 -0.014801 -0.099439  0.059995  ... -0.066305 

In [17]:
# Generate lookalike recommendations for the first 20 customers
lookalike_results = {}
for customer_id in customers["CustomerID"][:20]:
    if customer_id in similarity_df.index:  # Ensure the customer exists in the matrix
        similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
        lookalike_results[customer_id] = [
            (sim_customer, round(score, 2)) for sim_customer, score in similar_customers.items()
        ]
    else:
        lookalike_results[customer_id] = []

In [18]:
# Save results to a DataFrame
lookalike_df = pd.DataFrame([
    {"CustomerID": key, "SimilarCustomers": value} for key, value in lookalike_results.items()
])

# Save to CSV
lookalike_df.to_csv("Shreya_Talekar_Lookalike.csv", index=False)

print("Lookalike results saved to 'Shreya_Talekar_Lookalike.csv'.")


Lookalike results saved to 'Shreya_Talekar_Lookalike.csv'.
