In [65]:
# Importing library
import pandas as pd

In [66]:
# Loading the datasets from local files
customers = pd.read_csv(r"C:\Users\Windows 10\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\Windows 10\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\Windows 10\Downloads\Transactions.csv")

In [67]:
# Displaying the first few rows of each dataset to verify the data has loaded correctly
print(customers.head())
print(products.head())
print(transactions.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [68]:
# Merging Transactions with Products on ProductID
transactions_products = transactions.merge(products, on='ProductID', how='left')

# Preview the merged data
print(transactions_products.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68  


In [69]:
# Merging the transactions and products with Customers on CustomerID
merged_data = transactions_products.merge(customers, on='CustomerID', how='left')

# Preview the merged dataset
print(merged_data.head())

  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  202

In [70]:
# Aggregate customer features
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',               # Total spending
    'TransactionID': 'count',          # Number of transactions
    'Category': lambda x: x.mode()[0]  # Most frequent product category
}).reset_index()

# Renaming columns for clarity
customer_features.columns = ['CustomerID', 'TotalSpending', 'TransactionCount', 'PreferredCategory']

# Preview the aggregated features
print(customer_features.head())


  CustomerID  TotalSpending  TransactionCount PreferredCategory
0      C0001        3354.52                 5       Electronics
1      C0002        1862.74                 4          Clothing
2      C0003        2725.38                 4        Home Decor
3      C0004        5354.88                 8             Books
4      C0005        2034.24                 3       Electronics


In [71]:
# Encoding PreferredCategory
customer_features['PreferredCategory'] = customer_features['PreferredCategory'].astype('category').cat.codes

# Preview the dataset
print(customer_features.head())


  CustomerID  TotalSpending  TransactionCount  PreferredCategory
0      C0001        3354.52                 5                  2
1      C0002        1862.74                 4                  1
2      C0003        2725.38                 4                  3
3      C0004        5354.88                 8                  0
4      C0005        2034.24                 3                  2


In [72]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Normalize the numerical features
customer_features[['TotalSpending', 'TransactionCount']] = scaler.fit_transform(
    customer_features[['TotalSpending', 'TransactionCount']]
)

# Preview the normalized dataset
print(customer_features.head())


  CustomerID  TotalSpending  TransactionCount  PreferredCategory
0      C0001      -0.061701         -0.011458                  2
1      C0002      -0.877744         -0.467494                  1
2      C0003      -0.405857         -0.467494                  3
3      C0004       1.032547          1.356650                  0
4      C0005      -0.783929         -0.923530                  2


In [77]:
feature_matrix = customer_features[['TotalSpending', 'TransactionCount', 'PreferredCategory']]

In [78]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Compute cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

# Convert similarity matrix to a DataFrame for easier interpretation
similarity_df = pd.DataFrame(
    similarity_matrix, 
    index=customer_features['CustomerID'], 
    columns=customer_features['CustomerID']
)

# Preview the similarity matrix
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.729803  0.983842 -0.023232  0.867516 -0.016217   
C0002       0.729803  1.000000  0.827480 -0.640708  0.946070 -0.166000   
C0003       0.983842  0.827480  1.000000 -0.201686  0.942385  0.025963   
C0004      -0.023232 -0.640708 -0.201686  1.000000 -0.517337 -0.191994   
C0005       0.867516  0.946070  0.942385 -0.517337  1.000000  0.072263   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.895151  0.784800  0.535544  0.706127  ...  0.031060  0.924329   
C0002       0.898314  0.290823  0.965896  0.999094  ...  0.636535  0.934676   
C0003       0.959306  0.669030  0.667608  0.805938  ...  0.139167  0.972679   
C0004      -0.456122  0.547089 -0.810652 -0.651373  ... -0.640514 -0.380661   
C0005  

In [79]:
# Define a dictionary to store lookalike results
lookalike_results = {}

# Extract top 3 similar customers for the first 20 customers
for customer_id in customer_features['CustomerID'][:20]:
    # Sort similar customers in descending order (exclude self)
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    
    # Store the results in the dictionary
    lookalike_results[customer_id] = [(index, round(score, 2)) for index, score in similar_customers.items()]

# Preview the lookalike results
print(lookalike_results)


{'C0001': [('C0152', 1.0), ('C0164', 1.0), ('C0160', 1.0)], 'C0002': [('C0029', 1.0), ('C0010', 1.0), ('C0060', 0.98)], 'C0003': [('C0178', 1.0), ('C0052', 1.0), ('C0166', 1.0)], 'C0004': [('C0021', 1.0), ('C0075', 1.0), ('C0156', 1.0)], 'C0005': [('C0112', 1.0), ('C0197', 1.0), ('C0186', 1.0)], 'C0006': [('C0117', 0.99), ('C0168', 0.94), ('C0185', 0.73)], 'C0007': [('C0092', 1.0), ('C0120', 1.0), ('C0133', 1.0)], 'C0008': [('C0084', 0.99), ('C0162', 0.98), ('C0113', 0.97)], 'C0009': [('C0077', 1.0), ('C0083', 1.0), ('C0062', 0.99)], 'C0010': [('C0029', 1.0), ('C0002', 1.0), ('C0009', 0.97)], 'C0011': [('C0064', 1.0), ('C0187', 1.0), ('C0018', 1.0)], 'C0012': [('C0105', 1.0), ('C0039', 0.99), ('C0067', 0.99)], 'C0013': [('C0143', 1.0), ('C0054', 1.0), ('C0099', 0.99)], 'C0014': [('C0151', 1.0), ('C0097', 1.0), ('C0060', 1.0)], 'C0015': [('C0131', 1.0), ('C0036', 1.0), ('C0132', 1.0)], 'C0016': [('C0183', 1.0), ('C0170', 1.0), ('C0182', 1.0)], 'C0017': [('C0090', 1.0), ('C0175', 0.99), 

In [80]:
# Convert results to a DataFrame for saving
lookalike_csv = pd.DataFrame({
    'cust_id': lookalike_results.keys(),
    'similar_customers': [str(value) for value in lookalike_results.values()]
})

# Save to CSV
lookalike_csv.to_csv('Lookalike.csv', index=False)

# Preview the saved file
print(lookalike_csv.head())

  cust_id                                  similar_customers
0   C0001   [('C0152', 1.0), ('C0164', 1.0), ('C0160', 1.0)]
1   C0002  [('C0029', 1.0), ('C0010', 1.0), ('C0060', 0.98)]
2   C0003   [('C0178', 1.0), ('C0052', 1.0), ('C0166', 1.0)]
3   C0004   [('C0021', 1.0), ('C0075', 1.0), ('C0156', 1.0)]
4   C0005   [('C0112', 1.0), ('C0197', 1.0), ('C0186', 1.0)]
