In [1]:
import pandas as pd

# Read the three source tables (customers, products, transactions) from the
# working directory, in that order.
customers, products, transactions = [
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
]

# Preview the leading rows of each table to confirm the column layout
for table in (customers, products, transactions):
    print(table.head())


  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

In [2]:
from sklearn.preprocessing import StandardScaler

# Attach the customer profile columns to every transaction row.
# The join starts from transactions, so a customer with no transactions
# contributes no rows here and will not appear in customer_features.
merged_data = transactions.merge(customers, on='CustomerID', how='left')

# Columns that are aggregated per customer and later standardized
numerical_features = ['total_spending', 'purchase_frequency', 'avg_purchase_value']

# One row per customer: sum and mean of TotalValue, count of TransactionID
per_customer = merged_data.groupby('CustomerID')
customer_features = per_customer.agg(
    total_spending=('TotalValue', 'sum'),
    purchase_frequency=('TransactionID', 'count'),
    avg_purchase_value=('TotalValue', 'mean'),
).reset_index()

# Bring the Region profile column back alongside the aggregates
region_lookup = customers[['CustomerID', 'Region']]
customer_features = customer_features.merge(region_lookup, on='CustomerID', how='left')

# Standardize the numeric columns; the raw aggregates are overwritten by
# their scaled values in customer_features.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_features[numerical_features])
customer_features[numerical_features] = scaled_values

print(customer_features.head())


  CustomerID  total_spending  purchase_frequency  avg_purchase_value  \
0      C0001       -0.061701           -0.011458           -0.070263   
1      C0002       -0.877744           -0.467494           -0.934933   
2      C0003       -0.405857           -0.467494           -0.026271   
3      C0004        1.032547            1.356650           -0.076769   
4      C0005       -0.783929           -0.923530           -0.040028   

          Region  
0  South America  
1           Asia  
2  South America  
3  South America  
4           Asia  


In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between customers, computed over the numeric
# feature columns only; row/column i corresponds to the i-th row of
# customer_features.
feature_matrix = customer_features[numerical_features]
similarity_matrix = cosine_similarity(feature_matrix)

# Show the resulting matrix
print(similarity_matrix)


[[ 1.          0.97368428  0.55235809 ...  0.81778127  0.96329967
  -0.97957178]
 [ 0.97368428  1.          0.70863336 ...  0.9245182   0.99886722
  -0.91989556]
 [ 0.55235809  0.70863336  1.         ...  0.917472    0.74140513
  -0.38020974]
 ...
 [ 0.81778127  0.9245182   0.917472   ...  1.          0.94116243
  -0.70106232]
 [ 0.96329967  0.99886722  0.74140513 ...  0.94116243  1.
  -0.9005327 ]
 [-0.97957178 -0.91989556 -0.38020974 ... -0.70106232 -0.9005327
   1.        ]]


In [4]:
lookalikes = {}

# Row i of similarity_matrix corresponds to the i-th row of customer_features,
# so customers are addressed by position here rather than by index label.
all_customer_ids = customer_features['CustomerID'].tolist()

# For each of the first 20 rows of customer_features (C0001 - C0020, provided
# each of those customers has at least one transaction)
for idx, customer_id in enumerate(all_customer_ids[:20]):
    # Pair every *other* customer's position with its similarity score.
    # The current customer is removed by position instead of assuming it sorts
    # first: a customer with a proportional feature vector can tie the
    # self-similarity (or edge past it through floating-point rounding), which
    # would otherwise put the customer into their own lookalike list and drop
    # a genuine match.
    similarity_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]

    # Highest similarity first (stable sort, so ties keep row order),
    # then keep the three best matches as (CustomerID, score) pairs
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    top_3_lookalikes = [
        (all_customer_ids[other_idx], score)
        for other_idx, score in similarity_scores[:3]
    ]

    # Store the results
    lookalikes[customer_id] = top_3_lookalikes

# Print lookalikes for the first customer
print(lookalikes['C0001'])


[('C0137', 0.9993600788417096), ('C0152', 0.9956575062125335), ('C0121', 0.9930123335059389)]


In [5]:
# Flatten each customer's (lookalike ID, similarity score) pairs into one wide
# row: the customer's own ID followed by ID/score for each recommendation.
lookalike_data = [
    [customer_id, *(field for rec in recommendations for field in (rec[0], rec[1]))]
    for customer_id, recommendations in lookalikes.items()
]

# Column layout of the output file
columns = [
    'CustomerID',
    'Lookalike_1', 'Score_1',
    'Lookalike_2', 'Score_2',
    'Lookalike_3', 'Score_3',
]

# Build the table and write it out without the row index
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Preview what was written
print(lookalike_df.head())


  CustomerID Lookalike_1   Score_1 Lookalike_2   Score_2 Lookalike_3   Score_3
0      C0001       C0137  0.999360       C0152  0.995658       C0121  0.993012
1      C0002       C0029  0.999638       C0199  0.998867       C0010  0.998831
2      C0003       C0005  0.999894       C0178  0.999565       C0144  0.999217
3      C0004       C0067  0.999991       C0021  0.999658       C0075  0.999288
4      C0005       C0003  0.999894       C0073  0.999495       C0063  0.999259
