In [3]:
import pandas as pd

# Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

# Merge transactions with product information
transactions = transactions.merge(products, on='ProductID', how='inner')

# Merge customer information
data = transactions.merge(customers, on='CustomerID', how='inner')

# Preview the combined data
print(data.head())


  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x                      ProductName     Category  Price_y  \
0      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
1      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
2      300.68   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
3      601.36   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   
4      902.04   300.68  ComfortLiving Bluetooth Speaker  Electronics   300.68   

      CustomerName         Region  SignupDate  
0   Andrea Jenkins         Europe  03-

In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Convert dates to datetime
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'],format='%d-%m-%Y')
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])

# Aggregate transaction data
transaction_features = transactions.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],
    'ProductID': 'nunique', 
    'Category': lambda x: x.mode()[0]  
}).reset_index()

# Rename columns
transaction_features.columns = ['CustomerID', 'TotalSpending', 'AvgTransactionValue', 'UniqueProducts', 'TopCategory']

# Encode TopCategory
transaction_features = pd.get_dummies(transaction_features, columns=['TopCategory'], prefix='Category')

# Merge with customer data
customer_data = customers.merge(transaction_features, on='CustomerID', how='left')

# Select features for similarity calculation
features = customer_data.drop(['CustomerID', 'SignupDate'], axis=1)

# Handle missing values by filling them with 0
features = features.fillna(0)

# Convert non-numeric columns to numeric 
features = features.apply(pd.to_numeric, errors='coerce').fillna(0)

# Normalize features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# Compute cosine similarity
similarity_matrix = cosine_similarity(normalized_features)

# Convert similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

# Summary
print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.298255  0.349927  0.438417  0.984110  0.411723   
C0002       0.298255  1.000000  0.275652  0.348556  0.246927  0.318325   
C0003       0.349927  0.275652  1.000000  0.396076  0.300648  0.393963   
C0004       0.438417  0.348556  0.396076  1.000000  0.344429  0.938406   
C0005       0.984110  0.246927  0.300648  0.344429  1.000000  0.367057   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.979384  0.414004  0.214254  0.288285  ...  0.361895  0.990921   
C0002       0.272068  0.338211  0.988409  0.999680  ...  0.287675  0.248935   
C0003       0.337496  0.908210  0.197045  0.265422  ...  0.334009  0.289815   
C0004       0.375696  0.527879  0.252444  0.339231  ...  0.975185  0.361359   
C0005  

In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

# Ensure the customer IDs in similarity_df and customer_data match types (string) 
similarity_df.index = similarity_df.index.astype(str)  
similarity_df.columns = similarity_df.columns.astype(str)  
customer_data['CustomerID'] = customer_data['CustomerID'].astype(str)  
# Debugging: Check the data types
print("Data type of similarity_df index:", similarity_df.index.dtype)
print("Data type of customer_data['CustomerID']:", customer_data['CustomerID'].dtype)

# Strip spaces from customer IDs  for correct comparison 
similarity_df.index = similarity_df.index.str.strip()  
customer_data['CustomerID'] = customer_data['CustomerID'].str.strip()  

# Debugging: Check for missing customers in similarity_df
missing_customers = [customer for customer in customer_data['CustomerID'][:20] if customer not in similarity_df.index]

# Print missing customers 
if missing_customers:
    print("Missing Customers from similarity_df index:")
    for customer in missing_customers:
        print(f"Customer {customer} not found in similarity_df index")
else:
    print("No missing customers found.")

# Initialize the lookalike map
lookalike_map = {}

# Iterate over the first 20 customers
for customer_id in customer_data['CustomerID'][:20]:
    if customer_id in similarity_df.index:  # Check if customer_id exists in similarity_df
        # Get similarity scores for this customer
        scores = similarity_df.loc[customer_id].sort_values(ascending=False)

        # Exclude the customer itself 
        top_3 = scores.iloc[1:4]  # Take the next 3 highest scores
        lookalike_map[customer_id] = [(other_id, round(score, 2)) for other_id, score in top_3.items()]
    else:
        print(f"Customer ID {customer_id} not found in similarity matrix. Skipping...")

# Convert into a DataFrame
lookalike_df = pd.DataFrame({'CustomerID': lookalike_map.keys(), 'Lookalikes': lookalike_map.values()})

# Save as CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)




Data type of similarity_df index: object
Data type of customer_data['CustomerID']: object
No missing customers found.
