In [3]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Tunable parameters for the lookalike export.
N_TARGET_CUSTOMERS = 20  # recommendations are produced for the first N customers
TOP_N_LOOKALIKES = 3     # number of lookalikes kept per customer

# Step 1: Load datasets
customers = pd.read_csv("Customers.csv")
transactions = pd.read_csv("Transactions.csv")

# Step 2: Aggregate transaction data
# Total revenue, product diversity and total quantity per customer
agg_data = (
    transactions.groupby("CustomerID")
    .agg(
        TotalRevenue=("TotalValue", "sum"),
        ProductDiversity=("ProductID", "nunique"),
        TotalQuantity=("Quantity", "sum")
    )
    .reset_index()
)

# Step 3: Merge with customer profiles
# The left join keeps customers without any transaction; only their aggregated
# columns are filled with 0. Profile columns (Region, SignupDate, ...) are left
# untouched so a missing value is not silently turned into 0 (which would be
# parsed as the 1970 epoch for SignupDate).
customers = customers.merge(agg_data, on="CustomerID", how="left").fillna(
    {"TotalRevenue": 0, "ProductDiversity": 0, "TotalQuantity": 0}
)

# Step 4: Select relevant features and preprocessing
# Descriptive list of the model inputs; "Region" is expanded into dummy columns below.
features = ["Region", "SignupDate", "TotalRevenue", "ProductDiversity", "TotalQuantity"]
# Encoding categorical variables (e.g., Region); dtype=float keeps the whole
# feature matrix numeric regardless of the pandas default dummy dtype.
customers_encoded = pd.get_dummies(
    customers, columns=["Region"], drop_first=True, dtype=float
)

# Convert 'SignupDate' to numerical using days since the earliest date
signup_dates = pd.to_datetime(customers_encoded["SignupDate"])
customers_encoded["SignupDate"] = (signup_dates - signup_dates.min()).dt.days

# Standardize numerical data for distance calculation
scaler = StandardScaler()
scaled_features = scaler.fit_transform(
    customers_encoded.drop(columns=["CustomerID", "CustomerName"])
)

# Step 5: Calculate pairwise similarity
# Row/column i of the matrix corresponds to row i of `customers`.
similarity_matrix = cosine_similarity(scaled_features)

# Step 6: Recommend the most similar customers for the first N customers
customer_ids = customers["CustomerID"].tolist()
recommendations = {}
for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # Exclude the customer itself by index. Dropping "the first sorted entry"
    # is wrong whenever another customer ties with (or, through rounding,
    # exceeds) the self-similarity score.
    candidates = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[
        :TOP_N_LOOKALIKES
    ]
    # Cast to a plain Python float so the exported text reads "0.986" and not a
    # NumPy scalar repr, which the evaluation step could not parse back.
    recommendations[customer_id] = [
        (customer_ids[j], round(float(score), 3)) for j, score in top_matches
    ]

# Step 7: Export recommendations to CSV
# One row per customer; "lookalikes" is the stringified list of (id, score) tuples.
lookalike_df = pd.DataFrame(
    [
        {"cust_id": cust_id, "lookalikes": recs}
        for cust_id, recs in recommendations.items()
    ]
)
lookalike_df.to_csv("Lookalike.csv", index=False)


In [4]:
# Read the export back from the working directory, i.e. the same relative path
# it was written to, instead of a Colab-only absolute path.
df = pd.read_csv("Lookalike.csv")
df

Unnamed: 0,cust_id,lookalikes
0,C0001,"[('C0152', 0.986), ('C0174', 0.979), ('C0184',..."
1,C0002,"[('C0027', 0.937), ('C0007', 0.932), ('C0159',..."
2,C0003,"[('C0190', 0.962), ('C0191', 0.937), ('C0031',..."
3,C0004,"[('C0113', 0.978), ('C0102', 0.974), ('C0099',..."
4,C0005,"[('C0159', 0.997), ('C0007', 0.983), ('C0043',..."
5,C0006,"[('C0187', 0.927), ('C0137', 0.924), ('C0191',..."
6,C0007,"[('C0159', 0.988), ('C0005', 0.983), ('C0002',..."
7,C0008,"[('C0065', 0.891), ('C0068', 0.886), ('C0024',..."
8,C0009,"[('C0063', 0.961), ('C0121', 0.956), ('C0061',..."
9,C0010,"[('C0199', 0.976), ('C0073', 0.962), ('C0197',..."


Evaluating

In [5]:
import ast

import pandas as pd
from sklearn.metrics import average_precision_score

# Step 1: Read the exported recommendations and the tables used to validate them
lookalike_df = pd.read_csv("Lookalike.csv")      # recommendations under evaluation
transactions = pd.read_csv("Transactions.csv")   # transaction history
products = pd.read_csv("Products.csv")           # product information (presumably supplies "Category")

# Step 2: Build the ground truth used for validation
# Each customer is mapped to the set of product categories appearing in their
# transactions (transactions joined to products on ProductID).
purchases_with_category = transactions.merge(products, on="ProductID")
customer_product_categories = (
    purchases_with_category.groupby("CustomerID")["Category"].apply(set)
)

# Two customers count as "similar" when their category sets overlap in at least
# one category; a customer is never listed as similar to itself.
ground_truth = {
    cust_id: {
        other_id
        for other_id, other_categories in customer_product_categories.items()
        if other_id != cust_id and not categories.isdisjoint(other_categories)
    }
    for cust_id, categories in customer_product_categories.items()
}

# Step 3: Evaluate Recommendations
def precision_at_k(recommended, relevant, k=3):
    """
    Calculate Precision@K for a single customer.

    Args:
        recommended: Ordered sequence of recommended customer IDs, best first.
        relevant: Iterable of customer IDs considered relevant (ground truth).
        k: Number of top recommendations to evaluate; must be positive.

    Returns:
        The fraction hits / k, where hits is the number of distinct IDs among
        the first ``k`` recommendations that are relevant. The denominator is
        always ``k``, even when fewer than ``k`` recommendations are supplied.

    Raises:
        ValueError: If ``k`` is not positive. A zero ``k`` would divide by zero
            and a negative ``k`` would yield a meaningless negative precision.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    top_k = set(recommended[:k])
    hits = len(top_k.intersection(relevant))
    return hits / k

def map_score(recommendations, ground_truth):
    """
    Calculate Mean Average Precision (MAP) across all customers.

    Args:
        recommendations: Mapping {cust_id: [(rec_cust_id, score), ...]}.
        ground_truth: Mapping {cust_id: collection of relevant customer IDs}.
            Customers missing from this mapping are treated as having no
            relevant customers.

    Returns:
        The mean of the per-customer average precision values. Customers with
        an empty recommendation list are skipped. Returns 0.0 when no customer
        contributes a value (e.g. ``recommendations`` is empty).
    """
    average_precisions = []
    for cust_id, recs in recommendations.items():
        if not recs:
            # Nothing was recommended for this customer: skip it.
            continue
        relevant = ground_truth.get(cust_id, set())
        y_true = [1 if rec in relevant else 0 for rec, _ in recs]
        if not any(y_true):
            # Average precision is undefined without a positive label
            # (scikit-learn emits a warning for that input); score the
            # customer as 0 explicitly instead.
            average_precisions.append(0.0)
            continue
        y_scores = [score for _, score in recs]
        average_precisions.append(average_precision_score(y_true, y_scores))
    if not average_precisions:
        # Avoid dividing by zero when there is nothing to average.
        return 0.0
    return sum(average_precisions) / len(average_precisions)

# Extract recommendations as a dictionary: {cust_id: [(rec_cust_id, score), ...]}
# The "lookalikes" column holds the text form of a list of tuples.
# ast.literal_eval parses Python literals only, so content read from the CSV
# can never execute code the way eval() would.
recommendations = {
    cust_id: ast.literal_eval(raw_lookalikes)
    for cust_id, raw_lookalikes in zip(
        lookalike_df["cust_id"], lookalike_df["lookalikes"]
    )
}

# Evaluate Precision@K (customers absent from the ground truth have no relevant IDs)
precision_scores = {
    cust_id: precision_at_k([rec[0] for rec in recs], ground_truth.get(cust_id, []))
    for cust_id, recs in recommendations.items()
}

# Evaluate MAP
map_result = map_score(recommendations, ground_truth)

# Step 4: Display Evaluation Results
print("Precision@3 for each customer:")
for cust_id, score in precision_scores.items():
    print(f"Customer {cust_id}: {score:.2f}")

print(f"\nMean Average Precision (MAP): {map_result:.2f}")


Precision@3 for each customer:
Customer C0001: 1.00
Customer C0002: 1.00
Customer C0003: 1.00
Customer C0004: 1.00
Customer C0005: 1.00
Customer C0006: 1.00
Customer C0007: 1.00
Customer C0008: 1.00
Customer C0009: 0.67
Customer C0010: 0.33
Customer C0011: 1.00
Customer C0012: 1.00
Customer C0013: 1.00
Customer C0014: 0.33
Customer C0015: 0.00
Customer C0016: 1.00
Customer C0017: 1.00
Customer C0018: 1.00
Customer C0019: 1.00
Customer C0020: 0.00

Mean Average Precision (MAP): 0.82


