In [6]:
# Import libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
customers = pd.read_csv('/content/Customers.csv')
transactions = pd.read_csv('/content/Products.csv')
products = pd.read_csv('/content/Transactions.csv')

print("Transactions Columns:", transactions.columns.tolist())
print("Customers Columns:", customers.columns.tolist())
print("Products Columns:", products.columns.tolist())



Transactions Columns: ['ProductID', 'ProductName', 'Category', 'Price']
Customers Columns: ['CustomerID', 'CustomerName', 'Region', 'SignupDate']
Products Columns: ['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate', 'Quantity', 'TotalValue', 'Price']


In [30]:
# Merge the datasets
transactions_with_products = transactions.merge(products[['ProductID', 'CustomerID', 'Quantity', 'TotalValue']],
                                                on='ProductID', how='inner')

# Now, merge this data with customers to add customer details based on CustomerID
final_data = transactions_with_products.merge(customers, on='CustomerID', how='inner')

# Check the result to verify that the Quantity and other columns are now available
final_data.head()


Unnamed: 0,ProductID,ProductName,Category,Price,CustomerID,Quantity,TotalValue,CustomerName,Region,SignupDate
0,P001,ActiveWear Biography,Books,169.3,C0019,2,338.6,Brandon Rodriguez,Europe,2023-01-12
1,P001,ActiveWear Biography,Books,169.3,C0024,4,677.2,Michele Cooley,North America,2024-02-05
2,P001,ActiveWear Biography,Books,169.3,C0071,2,338.6,Taylor Murphy,South America,2022-07-01
3,P001,ActiveWear Biography,Books,169.3,C0036,2,338.6,Brian Aguilar DDS,North America,2024-07-06
4,P001,ActiveWear Biography,Books,169.3,C0191,1,169.3,Samantha Gibson DVM,South America,2024-04-07


In [31]:
# Aggregate data by customer
customer_agg = final_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',         # Total spend
    'Quantity': 'sum',           # Total quantity purchased
    'Category': lambda x: len(x.unique()),  # Number of unique categories purchased
    'ProductID': 'nunique',      # Number of unique products purchased
}).reset_index()

# Check the aggregated customer data
customer_agg.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,Category,ProductID
0,C0001,3354.52,12,3,5
1,C0002,1862.74,10,2,4
2,C0003,2725.38,14,3,4
3,C0004,5354.88,23,3,8
4,C0005,2034.24,7,2,3


In [34]:
# Step 5: Feature Engineering for similarity calculation
# We will create a feature matrix using the aggregated data
X = customer_agg[['TotalValue', 'Quantity', 'Category', 'ProductID']]

# Checking the feature matrix
print(X.head())



   TotalValue  Quantity  Category  ProductID
0     3354.52        12         3          5
1     1862.74        10         2          4
2     2725.38        14         3          4
3     5354.88        23         3          8
4     2034.24         7         2          3


In [35]:
# Step 6: Calculate the cosine similarity between customers
# Cosine similarity is calculated based on the features in X
cos_sim = cosine_similarity(X)

# Check the shape of the cosine similarity matrix
print(cos_sim.shape)


(199, 199)


In [36]:
# Step 7: Generate recommendations for each customer (top 3 lookalikes)
lookalikes = {}

# Iterate through each customer and calculate similarity
for i, customer_id in enumerate(customer_agg['CustomerID']):
    # Exclude the customer itself from the lookalikes
    similarity_scores = list(enumerate(cos_sim[i]))
    similarity_scores = [score for score in similarity_scores if score[0] != i]

    # Sort the similarity scores and pick the top 3 lookalikes
    similarity_scores.sort(key=lambda x: x[1], reverse=True)
    top_3 = [(customer_agg['CustomerID'][score[0]], score[1]) for score in similarity_scores[:3]]

    # Add to the lookalikes dictionary
    lookalikes[customer_id] = top_3

# Checking the lookalikes for the first customer
print(lookalikes['C0001'])


[('C0093', 0.9999999900880265), ('C0131', 0.9999999892627299), ('C0005', 0.9999999866567792)]


In [38]:
# Step 8: Prepare the Lookalike.csv file
lookalike_df = []

# Create a list of lookalikes
for customer_id, lookalike_list in lookalikes.items():
    for lookalike in lookalike_list:
        lookalike_df.append([customer_id, lookalike[0], lookalike[1]])

# Convert the list to a DataFrame
lookalike_df = pd.DataFrame(lookalike_df, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])

# Save the DataFrame to a CSV file
lookalike_df.to_csv('/content/drive/MyDrive/Colab Notebooks/Lookalike.csv', index=False)

# Final Output and Verification
print("Lookalike recommendations saved to 'Lookalike.csv'.")
lookalike_df.head()


Lookalike recommendations saved to 'Lookalike.csv'.


Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0093,1.0
1,C0001,C0131,1.0
2,C0001,C0005,1.0
3,C0002,C0034,1.0
4,C0002,C0030,1.0
