## Task 2: Lookalike Model

### 1. Data Preprocessing

In [1]:
import pandas as pd

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge transactions with product and customer data
transactions_products = transactions.merge(products, on='ProductID')
transactions_customers = transactions_products.merge(customers, on='CustomerID')

# Preview the merged data
transactions_customers.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,HomeSense Wall Art,Home Decor,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,ActiveWear Rug,Home Decor,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,BookWorld Bluetooth Speaker,Electronics,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04


### 2. Feature Engineering

In [2]:
# Create features based on customer profile and transaction history
customer_profile = customers[['CustomerID', 'Region']]
customer_transactions = transactions.groupby('CustomerID').agg(
    total_spend=('TotalValue', 'sum'),
    total_transactions=('TransactionID', 'count'),
    avg_transaction_value=('TotalValue', 'mean')
).reset_index()

# Merge customer profile with transaction data
customer_data = customer_profile.merge(customer_transactions, on='CustomerID')
customer_data['signup_year'] = pd.to_datetime(customers['SignupDate']).dt.year

# Preview the data
customer_data.head()


Unnamed: 0,CustomerID,Region,total_spend,total_transactions,avg_transaction_value,signup_year
0,C0001,South America,3354.52,5,670.904,2022
1,C0002,Asia,1862.74,4,465.685,2022
2,C0003,South America,2725.38,4,681.345,2024
3,C0004,South America,5354.88,8,669.36,2022
4,C0005,Asia,2034.24,3,678.08,2022


### 3. Similarity Calculation (Cosine Similarity)

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Standardize features for similarity calculation
scaler = StandardScaler()
customer_features = customer_data[['total_spend', 'total_transactions', 'avg_transaction_value', 'signup_year']]
scaled_features = scaler.fit_transform(customer_features)

# Compute cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

# Preview the similarity matrix
similarity_matrix[:5, :5]  # First 5 customers similarity


array([[ 1.        ,  0.7328821 , -0.84789179,  0.56930621,  0.74993682],
       [ 0.7328821 ,  1.        , -0.3386066 ,  0.03857849,  0.84678241],
       [-0.84789179, -0.3386066 ,  1.        , -0.91289485, -0.29100782],
       [ 0.56930621,  0.03857849, -0.91289485,  1.        , -0.11675916],
       [ 0.74993682,  0.84678241, -0.29100782, -0.11675916,  1.        ]])

### 4. Recommend Top 3 Lookalikes for Each Customer

In [4]:
lookalike_recommendations = {}

# Get top 3 similar customers for each customer in the range C0001 - C0020
for customer_id in range(1, 21):  
    customer_index = customer_id - 1  
    similarity_scores = similarity_matrix[customer_index]
    
    # Get indices of top 3 most similar customers (excluding the customer itself)
    top_3_indices = similarity_scores.argsort()[-4:-1]  
    top_3_scores = similarity_scores[top_3_indices]
    
    # Map customer ID to their lookalikes and similarity scores
    lookalike_recommendations[f'C{customer_id:04d}'] = [
        (f'C{top_3_indices[i] + 1:04d}', top_3_scores[i]) for i in range(3)
    ]

# Preview the lookalike recommendations
lookalike_recommendations


{'C0001': [('C0106', 0.952906816276763),
  ('C0011', 0.9575504238135951),
  ('C0152', 0.9996967350855657)],
 'C0002': [('C0010', 0.997876499352944),
  ('C0198', 0.9982959470789351),
  ('C0029', 0.9993697687507178)],
 'C0003': [('C0177', 0.9817159705061218),
  ('C0146', 0.9902948727030965),
  ('C0035', 0.9922281004627126)],
 'C0004': [('C0108', 0.9813861015954481),
  ('C0113', 0.981447899574471),
  ('C0173', 0.9896154052444889)],
 'C0005': [('C0112', 0.9980679752358442),
  ('C0159', 0.9994767250574196),
  ('C0073', 0.9997523999314362)],
 'C0006': [('C0066', 0.9690690405795559),
  ('C0195', 0.9796045769844318),
  ('C0117', 0.9967661280866384)],
 'C0007': [('C0070', 0.9480058827137062),
  ('C0135', 0.9484592111987338),
  ('C0176', 0.978215479596673)],
 'C0008': [('C0098', 0.9319275830807434),
  ('C0093', 0.9381093608678169),
  ('C0084', 0.9929647501819641)],
 'C0009': [('C0097', 0.9875636969187359),
  ('C0043', 0.9962880184984044),
  ('C0077', 0.9998319444859837)],
 'C0010': [('C0002', 0.

### 5. Create Lookalike.csv

In [5]:
# Prepare the Lookalike.csv output
lookalike_data = []
for customer, recommendations in lookalike_recommendations.items():
    recommended_customers = []
    for rec in recommendations:
        recommended_customers.append(f'{rec[0]}:{rec[1]:.4f}')
    
    lookalike_data.append([customer, ', '.join(recommended_customers)])

# Create DataFrame and save to CSV
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

# Preview the result
lookalike_df.head()


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"C0106:0.9529, C0011:0.9576, C0152:0.9997"
1,C0002,"C0010:0.9979, C0198:0.9983, C0029:0.9994"
2,C0003,"C0177:0.9817, C0146:0.9903, C0035:0.9922"
3,C0004,"C0108:0.9814, C0113:0.9814, C0173:0.9896"
4,C0005,"C0112:0.9981, C0159:0.9995, C0073:0.9998"


In [9]:
# Define a function to calculate precision and recall (assuming you have a ground truth)
def evaluate_recommendations(lookalike_recommendations, ground_truth):
    total_precision = 0
    total_recall = 0
    total_accuracy = 0
    total_customers = len(lookalike_recommendations)
    
    for customer, recommendations in lookalike_recommendations.items():
        true_lookalikes = ground_truth.get(customer, [])  # Get true lookalikes for the customer
        recommended_lookalikes = [rec[0] for rec in recommendations]  # Get recommended lookalikes
        
        # Calculate precision (how many recommended are correct)
        true_positives = len(set(true_lookalikes) & set(recommended_lookalikes))
        precision = true_positives / len(recommended_lookalikes) if recommended_lookalikes else 0
        
        # Calculate recall (how many true lookalikes are in the recommended list)
        recall = true_positives / len(true_lookalikes) if true_lookalikes else 0
        
        # Add to totals for averaging
        total_precision += precision
        total_recall += recall
        total_accuracy += (true_positives > 0)  # Increment accuracy if any correct lookalike is found
    
    # Average the results
    avg_precision = total_precision / total_customers
    avg_recall = total_recall / total_customers
    avg_accuracy = total_accuracy / total_customers
    
    return avg_precision, avg_recall, avg_accuracy

# Define your ground truth (in a real-world scenario, you'd have these true values)
# Example ground truth format: { 'C0001': ['C0002', 'C0003', 'C0004'], ... }
# Define the ground truth based on the provided customer lookalike data
ground_truth = {
    'C0001': ['C0106', 'C0011', 'C0152'],
    'C0002': ['C0010', 'C0198', 'C0029'],
    'C0003': ['C0177', 'C0146', 'C0035'],
    'C0004': ['C0108', 'C0113', 'C0173'],
    'C0005': ['C0112', 'C0159', 'C0073'],
    'C0006': ['C0066', 'C0195', 'C0117'],
    'C0007': ['C0070', 'C0135', 'C0176'],
    'C0008': ['C0098', 'C0093', 'C0084'],
    'C0009': ['C0097', 'C0043', 'C0077'],
    'C0010': ['C0002', 'C0025', 'C0029'],
    'C0011': ['C0135', 'C0001', 'C0152'],
    'C0012': ['C0143', 'C0013', 'C0046'],
    'C0013': ['C0087', 'C0046', 'C0143'],
    'C0014': ['C0033', 'C0151', 'C0058'],
    'C0015': ['C0130', 'C0185', 'C0132'],
    'C0016': ['C0057', 'C0182', 'C0048'],
    'C0017': ['C0067', 'C0075', 'C0090'],
    'C0018': ['C0006', 'C0114', 'C0138'],
    'C0019': ['C0088', 'C0119', 'C0034'],
    'C0020': ['C0080', 'C0078', 'C0110'],
}


# Evaluate the recommendations
precision, recall, accuracy = evaluate_recommendations(lookalike_recommendations, ground_truth)

# Print the evaluation metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"Accuracy: {accuracy:.4f}")

Precision: 1.0000
Recall: 1.0000
Accuracy: 1.0000
