In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Read the CSV file into a DataFrame
customers = pd.read_csv('/content/drive/MyDrive/Zeotap/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/Zeotap/Products.csv')
transactions= pd.read_csv('/content/drive/MyDrive/Zeotap/Transactions.csv')

In [4]:
customers.head(5)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate
0,C0001,Lawrence Carroll,South America,2022-07-10
1,C0002,Elizabeth Lutz,Asia,2022-02-13
2,C0003,Michael Rivera,South America,2024-03-07
3,C0004,Kathleen Rodriguez,South America,2022-10-09
4,C0005,Laura Weber,Asia,2022-08-15


In [5]:
products.head(5)

Unnamed: 0,ProductID,ProductName,Category,Price
0,P001,ActiveWear Biography,Books,169.3
1,P002,ActiveWear Smartwatch,Electronics,346.3
2,P003,ComfortLiving Biography,Books,44.12
3,P004,BookWorld Rug,Home Decor,95.69
4,P005,TechPro T-Shirt,Clothing,429.31


In [6]:
transactions.head(5)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


###**1. Lookalike Customer Map Based On Cosine Similarity**

In [7]:
# Merge data
merged_data = transactions.merge(customers, on='CustomerID', how='left') \
                          .merge(products, on='ProductID', how='left')

# Aggregate transaction data for each customer
customer_profile = merged_data.groupby('CustomerID').agg({
    'Region': 'first',
    'TotalValue': 'sum',
    'Quantity': 'sum',
    'Category': lambda x: x.value_counts().idxmax()  # Most frequent category
}).reset_index()

# Extract features
features = customer_profile[['TotalValue', 'Quantity']]
features = pd.get_dummies(features)  # Handle categorical variables if needed

# Normalize the features
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Compute cosine similarity
cos_sim = cosine_similarity(features_scaled)

# Function to get top 3 lookalikes
def get_top_lookalikes(customer_id, top_n=3):
    customer_idx = customers[customers['CustomerID'] == customer_id].index[0]
    sim_scores = list(enumerate(cos_sim[customer_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1]  # Exclude the customer itself
    return [(customers.iloc[i[0]]['CustomerID'], i[1]) for i in sim_scores]

# Generate lookalike map
lookalike_map = {}
for customer_id in customers['CustomerID'][:20]:  # First 20 customers
    lookalike_map[customer_id] = get_top_lookalikes(customer_id)

# Save the lookalike results
lookalike_df = pd.DataFrame([
    {'CustomerID': key, 'Lookalikes': value} for key, value in lookalike_map.items()
])

lookalike_df.to_csv('Lookalike.csv', index=False)

# Print results
lookalike_df.head(5)


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[(C0085, 0.9999990504724361), (C0042, 0.999821..."
1,C0002,"[(C0157, 0.9999942410168485), (C0166, 0.999875..."
2,C0003,"[(C0111, 0.9940081095432594), (C0160, 0.990454..."
3,C0004,"[(C0162, 0.9999999965087093), (C0165, 0.999959..."
4,C0005,"[(C0080, 0.9999822355480511), (C0167, 0.999975..."


###**2. Function to recommend similar customers based on user input**

In [8]:
# Taking user input for a new customer (simulate new customer entry)
def get_user_input():
    print("Enter your details to find similar customers:")

    # Ask user for transaction data (you can expand this to include more data)
    total_value = float(input("Total transaction value: "))
    quantity = int(input("Total quantity of products purchased: "))
    region = input("Region(ex: South America,Asia,North America,Europe): ")

    # Create a DataFrame for the user
    user_data = pd.DataFrame({
        'CustomerID': [0],  # Placeholder for new customer
        'TotalValue': [total_value],
        'Quantity': [quantity],
        'Region': [region],
    })

    # Convert user input into similar format to training data
    user_data_dummies = pd.get_dummies(user_data[['TotalValue', 'Quantity']])  # Ensure this matches the model
    user_data_scaled = scaler.transform(user_data_dummies)  # Scale the user's input

    # Calculate similarity with existing customers
    sim_scores = cosine_similarity(user_data_scaled, features_scaled)
    return sim_scores.flatten()


In [9]:
# Function to recommend similar customers based on user input
def recommend_similar_customers():
    sim_scores = get_user_input()

    # Sort by similarity score and exclude the user themselves (index 0)
    similar_customers_idx = np.argsort(sim_scores)[::-1][1:4]  # Top 3 similar customers

    print("\nTop 3 similar customers:")
    for idx in similar_customers_idx:
        customer_id = customers.iloc[idx]['CustomerID']
        similarity = sim_scores[idx]
        print(f"CustomerID: {customer_id}, Similarity Score: {similarity:.4f}")


#taking user input
recommend_similar_customers()

Enter your details to find similar customers:
Total transaction value: 506
Total quantity of products purchased: 2
Region(ex: South America,Asia,North America,Europe): Asia

Top 3 similar customers:
CustomerID: C0063, Similarity Score: 0.9999
CustomerID: C0095, Similarity Score: 0.9999
CustomerID: C0150, Similarity Score: 0.9999


###**3. Evaluation**

In [10]:
# Evaluate model accuracy based on average cosine similarity between all customer pairs
def evaluate_cosine_similarity():
    # Calculate the average similarity (excluding self-similarity)
    n_customers = cos_sim.shape[0]
    total_similarity = 0
    count = 0

    for i in range(n_customers):
        for j in range(i + 1, n_customers):  # avoid double counting
            total_similarity += cos_sim[i, j]
            count += 1

    avg_similarity = total_similarity / count
    print(f'Average Cosine Similarity between customers: {avg_similarity:.4f}')

# Call the function to evaluate model
evaluate_cosine_similarity()


Average Cosine Similarity between customers: 0.0084


In [11]:
# Evaluate the quality of recommendations by checking if the similarity score exceeds a threshold
def evaluate_recommendations(threshold=0.8):
    correct_recommendations = 0
    total_recommendations = 0

    # Iterate over all customers and evaluate the top N lookalikes
    for customer_id in customers['CustomerID'][:20]:  # You can adjust the range here
        for lookalike_id, similarity in lookalike_map[customer_id]:
            if similarity >= threshold:
                correct_recommendations += 1
            total_recommendations += 1

    recommendation_accuracy = correct_recommendations / total_recommendations
    print(f'Recommendation Accuracy (at threshold {threshold}): {recommendation_accuracy:.4f}')

# Call the function to evaluate recommendations
evaluate_recommendations()


Recommendation Accuracy (at threshold 0.8): 1.0000
