# 1)Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2)Loading the datasets

In [2]:
customers = pd.read_csv("C:/Users/srava/OneDrive/Desktop/Customers.csv")
products = pd.read_csv("C:/Users/srava/OneDrive/Desktop/Products.csv")
transactions = pd.read_csv("C:/Users/srava/OneDrive/Desktop/Transactions.csv")

# 3)Data Preparation

In [3]:
# Merging customers data with transactions and product data to create a unified dataset
customer_data = customers.merge(transactions, on='CustomerID', how='left')
customer_data = customer_data.merge(products, on='ProductID', how='left')

# Now we have a dataset that includes CustomerID, ProductID, Transaction details, Product details, etc.


In [4]:
customer_data.head(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 3:12:55,2.0,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 9:01:18,3.0,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 0:01:00,2.0,614.94,307.47,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 3:11:44,2.0,911.44,455.72,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3.0,1300.92,433.64,TechPro Headphones,Electronics,433.64


In [5]:
customer_data.isnull().sum()

CustomerID         0
CustomerName       0
Region             0
SignupDate         0
TransactionID      1
ProductID          1
TransactionDate    1
Quantity           1
TotalValue         1
Price_x            1
ProductName        1
Category           1
Price_y            1
dtype: int64

In [8]:
print(customer_data.duplicated().sum())

0


In [9]:
# Get all rows where any column has a null value
rows_with_null = customer_data[customer_data.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_null)


    CustomerID   CustomerName Region  SignupDate TransactionID ProductID  \
904      C0180  Amy Carpenter   Asia  2023-10-25           NaN       NaN   

    TransactionDate  Quantity  TotalValue  Price_x ProductName Category  \
904             NaN       NaN         NaN      NaN         NaN      NaN   

     Price_y  
904      NaN  


# 4)Feature Engineering

In [11]:
# Feature engineering: Aggregating transaction data per customer
customer_profile = customer_data.groupby('CustomerID').agg(
    total_spending=('TotalValue', 'sum'),
    num_transactions=('TransactionID', 'nunique'),
    num_products=('ProductID', 'nunique'),
    most_frequent_category=('Category', lambda x: x.mode().iloc[0] if not x.mode().empty else "Unknown")
).reset_index()


# 4)Similarity Calculation

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Normalize the features for cosine similarity calculation
scaler = StandardScaler()
customer_features = customer_profile[['total_spending', 'num_transactions', 'num_products']]

# Standardize the features
customer_features_scaled = scaler.fit_transform(customer_features)

# Compute cosine similarity between customers
similarity_matrix = cosine_similarity(customer_features_scaled)


# 5)Generate Recommendations

In [31]:
# Prepare the results
lookalike_recommendations = {}

# For customers C0001 to C0020, find the top 3 similar customers
for i in range(20):  # Assuming first 20 customers (C0001 to C0020)
    customer_id = f"C{str(i+1).zfill(4)}"
    customer_idx = customer_profile[customer_profile['CustomerID'] == customer_id].index[0]
    
    # Get similarity scores for the customer
    similarity_scores = similarity_matrix[customer_idx]
    
    # Get indices of the top 3 most similar customers (excluding the customer themselves)
    similar_customer_indices = similarity_scores.argsort()[-4:-1][::-1]  # Exclude self and sort by similarity
    
    # Format recommended customers as "C000X:-85" with scores out of 100
    recommended_customers = [f"{customer_profile.iloc[idx]['CustomerID']}:-{int(round(similarity_scores[idx] * 100))}"
                             for idx in similar_customer_indices]
    
    # Store in dictionary in the required format (comma-separated)
    lookalike_recommendations[customer_id] = ", ".join(recommended_customers)

# Convert to DataFrame
lookalike_df_grouped = pd.DataFrame(lookalike_recommendations.items(), columns=['CustomerID', 'Recommendations'])

# Save the results to Lookalike.csv
lookalike_df_grouped.to_csv('C:/Users/srava/OneDrive/Desktop/Lookalike.csv', index=False)

print("Lookalike.csv created in the required format!")



Lookalike.csv created in the required format!


# 6)Model Evaluation 

In [27]:
import numpy as np
from sklearn.metrics import average_precision_score

# Example: Ground truth similar customers for each customer
true_lookalikes = {
    'C0001': ['C0003', 'C0005', 'C0008'],
    'C0002': ['C0012', 'C0017', 'C0004'],
    'C0003': ['C0001', 'C0008', 'C0009'],
    # Add more ground truth values...
}

# Example: Model's recommended similar customers
predicted_lookalikes = {
    'C0001': ['C0003', 'C0005', 'C0007'],
    'C0002': ['C0012', 'C0004', 'C0009'],
    'C0003': ['C0001', 'C0009', 'C0010'],
    # Add more predictions...
}

def calculate_map(true_lookalikes, predicted_lookalikes):
    """Compute Mean Average Precision (MAP)"""
    map_scores = []
    
    for cust_id, true_customers in true_lookalikes.items():
        if cust_id in predicted_lookalikes:
            pred_customers = predicted_lookalikes[cust_id]
            
            # Convert to binary relevance (1 if relevant, 0 otherwise)
            y_true = [1 if cust in true_customers else 0 for cust in pred_customers]
            
            # Compute AP for this customer
            if sum(y_true) > 0:  # Ensure there is at least one relevant item
                ap_score = average_precision_score(y_true, y_true)
                map_scores.append(ap_score)
    
    return np.mean(map_scores) if map_scores else 0

# Compute MAP
map_score = calculate_map(true_lookalikes, predicted_lookalikes)
print(f"Mean Average Precision (MAP): {map_score*100:.4f}")




Mean Average Precision (MAP): 100.0000


In [29]:
def accuracy_out_of_100(true_lookalikes, predicted_lookalikes, k=3):
    """Compute Accuracy out of 100 for top-k recommendations"""
    correct_predictions = 0
    total_predictions = 0
    
    for cust_id, true_customers in true_lookalikes.items():
        if cust_id in predicted_lookalikes:
            pred_customers = predicted_lookalikes[cust_id][:k]
            correct_predictions += len(set(pred_customers) & set(true_customers))
            total_predictions += k
    
    # Calculate accuracy as percentage
    accuracy_percentage = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0
    return accuracy_percentage

# Compute accuracy
accuracy = accuracy_out_of_100(true_lookalikes, predicted_lookalikes, k=3)
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 66.67%


In [24]:
def precision_at_k(true_lookalikes, predicted_lookalikes, k=3):
    """Compute Precision@K"""
    precisions = []
    
    for cust_id, true_customers in true_lookalikes.items():
        if cust_id in predicted_lookalikes:
            pred_customers = predicted_lookalikes[cust_id][:k]
            precision = len(set(pred_customers) & set(true_customers)) / k
            precisions.append(precision)
    
    return np.mean(precisions) if precisions else 0

def mean_reciprocal_rank(true_lookalikes, predicted_lookalikes):
    """Compute Mean Reciprocal Rank (MRR)"""
    mrr_scores = []
    
    for cust_id, true_customers in true_lookalikes.items():
        if cust_id in predicted_lookalikes:
            pred_customers = predicted_lookalikes[cust_id]
            
            # Find the rank of the first relevant customer
            for rank, cust in enumerate(pred_customers, start=1):
                if cust in true_customers:
                    mrr_scores.append(1 / rank)
                    break
    
    return np.mean(mrr_scores) if mrr_scores else 0

# Compute additional metrics
precision_k = precision_at_k(true_lookalikes, predicted_lookalikes, k=3)
mrr_score = mean_reciprocal_rank(true_lookalikes, predicted_lookalikes)

print(f"Precision@3: {precision_k*100:.4f}")
print(f"Mean Reciprocal Rank (MRR): {mrr_score*100:.4f}")


Precision@3: 66.6667
Mean Reciprocal Rank (MRR): 100.0000
