In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

In [4]:
# Merge datasets
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

In [5]:
# Feature Engineering
# Aggregate transaction data for each customer
customer_features = merged_data.groupby("CustomerID").agg(
    TotalSpend=("TotalValue", "sum"),
    TotalQuantity=("Quantity", "sum"),
    UniqueProducts=("ProductID", lambda x: x.nunique()),
    AvgTransactionValue=("TotalValue", "mean")
).reset_index()

In [6]:
# Normalize the features
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features.iloc[:, 1:])  # Exclude CustomerID for normalization


In [7]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(normalized_features)

In [8]:
# Find top 3 lookalikes for the first 20 customers
lookalike_results = {}
for idx in range(20):  # First 20 customers
    customer_id = customer_features.iloc[idx, 0]  # Get the CustomerID
    scores = list(enumerate(similarity_matrix[idx]))
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:4]  # Exclude the customer itself
    lookalike_results[customer_id] = [(customer_features.iloc[i, 0], score) for i, score in sorted_scores]


In [9]:
# Create a DataFrame for Lookalike.csv
lookalike_data = []
for cust_id, lookalikes in lookalike_results.items():
    row = {
        "CustomerID": cust_id,
        "Lookalike1": lookalikes[0][0],
        "Score1": lookalikes[0][1],
        "Lookalike2": lookalikes[1][0],
        "Score2": lookalikes[1][1],
        "Lookalike3": lookalikes[2][0],
        "Score3": lookalikes[2][1],
    }
    lookalike_data.append(row)

lookalike_df = pd.DataFrame(lookalike_data)

# Save the results to Lookalike.csv
lookalike_df.to_csv("Nikita_Chauhan_Lookalike.csv", index=False)

# Output the Lookalike DataFrame
print(lookalike_df)


   CustomerID Lookalike1    Score1 Lookalike2    Score2 Lookalike3    Score3
0       C0001      C0164  0.971775      C0137  0.954396      C0103  0.946100
1       C0002      C0029  0.999543      C0031  0.997827      C0077  0.995802
2       C0003      C0027  0.838993      C0176  0.827958      C0073  0.765272
3       C0004      C0075  0.997562      C0175  0.993002      C0195  0.992907
4       C0005      C0123  0.998486      C0063  0.998121      C0095  0.996462
5       C0006      C0079  0.999984      C0117  0.998490      C0196  0.993010
6       C0007      C0140  0.998372      C0085  0.996300      C0080  0.992652
7       C0008      C0090  0.988008      C0179  0.986864      C0084  0.974458
8       C0009      C0192  0.996488      C0083  0.992422      C0043  0.992166
9       C0010      C0029  0.975250      C0094  0.974407      C0002  0.970414
10      C0011      C0064  0.993094      C0037  0.987955      C0171  0.986643
11      C0012      C0102  0.992214      C0045  0.987785      C0041  0.985908

In [10]:
# Import necessary libraries
from sklearn.metrics import mean_squared_error
import numpy as np

# Function to evaluate quality of recommendations
def evaluate_lookalike_model(similarity_matrix, customer_features, lookalike_results):
    # Extract the ground truth for validation (use a small test subset for manual evaluation or business logic validation)
    # Here, we assume that customers with similar transaction patterns should have similar TotalSpend and TotalQuantity
    
    # Initialize lists to track the evaluation metrics
    mse_scores = []
    similarity_score_sums = []

    for customer_id, recommendations in lookalike_results.items():
        # Get the features of the main customer
        main_customer_features = customer_features[customer_features['CustomerID'] == customer_id].iloc[:, 1:].values
        
        for rec_customer_id, similarity_score in recommendations:
            # Get the features of the recommended customer
            rec_customer_features = customer_features[customer_features['CustomerID'] == rec_customer_id].iloc[:, 1:].values

            # Compute the Mean Squared Error between features as a measure of similarity
            mse = mean_squared_error(main_customer_features, rec_customer_features)
            mse_scores.append(mse)
            
            # Track the similarity scores
            similarity_score_sums.append(similarity_score)

    # Average MSE
    avg_mse = np.mean(mse_scores)
    print(f"Average Mean Squared Error (MSE) of Feature Differences: {avg_mse:.4f}")

    # Average Similarity Score
    avg_similarity_score = np.mean(similarity_score_sums)
    print(f"Average Similarity Score: {avg_similarity_score:.4f}")

    # Quality Assessment Logic
    if avg_mse < 0.1 and avg_similarity_score > 0.8:
        print("The model provides high-quality recommendations based on the defined thresholds.")
    elif avg_mse < 0.2 and avg_similarity_score > 0.7:
        print("The model provides good recommendations, but there is room for improvement.")
    else:
        print("The model's recommendations need significant improvement.")

# Evaluate the logic of recommendations
def logic_check_lookalike_results(lookalike_results, similarity_matrix):
    """
    Logic validation:
    - Ensure that no customer is recommended as their own lookalike.
    - Check if the similarity scores are monotonically decreasing for each recommendation list.
    """
    passed_logic = True
    for customer_id, recommendations in lookalike_results.items():
        # Check no self-recommendations
        for rec_customer_id, _ in recommendations:
            if customer_id == rec_customer_id:
                print(f"Logic Error: Customer {customer_id} is recommended as their own lookalike.")
                passed_logic = False

        # Check similarity score order
        scores = [rec[1] for rec in recommendations]
        if not all(scores[i] >= scores[i + 1] for i in range(len(scores) - 1)):
            print(f"Logic Error: Similarity scores for Customer {customer_id} are not in descending order.")
            passed_logic = False

    if passed_logic:
        print("All logic checks passed successfully.")
    else:
        print("Logic checks failed. Review the recommendations.")

# Run evaluation on lookalike model
evaluate_lookalike_model(similarity_matrix, customer_features, lookalike_results)
logic_check_lookalike_results(lookalike_results, similarity_matrix)


Average Mean Squared Error (MSE) of Feature Differences: 210926.0958
Average Similarity Score: 0.9765
The model's recommendations need significant improvement.
All logic checks passed successfully.
