In [2]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# --- Load the three raw input tables ---
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# --- Attach customer and product attributes to every transaction ---
# Both merges use the default inner join, so a transaction whose CustomerID or
# ProductID has no match in the other table is dropped.
merged_data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")

# --- Collapse the transactions into one profile row per customer ---
customer_profiles = (
    merged_data.groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),  # total spend
        Quantity=("Quantity", "sum"),  # total quantity purchased
        # Most frequent category; mode() returns sorted values, so ties
        # resolve to the first one in sort order.
        Category=("Category", lambda purchases: purchases.mode()[0]),
        Region=("Region", "first"),  # region taken from the customer's first row
    )
    .reset_index()
)

# --- One-hot encode the categorical profile columns ---
categorical_columns = ["Category", "Region"]
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(customer_profiles[categorical_columns]).toarray()
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
)

# --- Assemble the feature table: ID, numeric totals, then one-hot columns ---
customer_features = pd.concat(
    [customer_profiles[["CustomerID", "TotalValue", "Quantity"]], encoded_df],
    axis=1,
)

# --- Standardize every feature column (everything except the ID) ---
# NOTE(review): this scales the one-hot columns as well as TotalValue/Quantity.
# Confirm that is intended; standardizing indicator columns changes how much
# each category contributes to the cosine similarity.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(columns="CustomerID"))

# --- Pairwise cosine similarity between customers ---
# Row/column i of the matrix corresponds to row i of customer_profiles.
similarity_matrix = cosine_similarity(scaled_features)

# Function to get top 3 lookalikes
def get_top_lookalikes(customer_id, similarity_matrix, customer_profiles, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_profiles["CustomerID"]``.
    similarity_matrix
        Square, row-indexable matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``customer_profiles``.
    customer_profiles : pandas.DataFrame
        Frame with a ``"CustomerID"`` column, in the same row order as
        ``similarity_matrix``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of (CustomerID, score) tuples
        Sorted by descending similarity; ties keep their row order. The
        queried customer itself is never included. Fewer than ``top_n`` pairs
        are returned when there are not enough other customers.

    Raises
    ------
    IndexError
        If ``customer_id`` is not present in ``customer_profiles``.
    """
    # Locate the customer by *position* rather than index label, so the lookup
    # stays aligned with the matrix even when the frame's index is not 0..n-1.
    matches = (customer_profiles["CustomerID"] == customer_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customer_profiles")
    customer_index = int(matches[0])

    # Drop the customer's own entry explicitly. Slicing off the first sorted
    # element is wrong when another customer ties with self (e.g. an identical
    # profile with similarity 1.0) and happens to sort ahead of it.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_matrix[customer_index])
        if position != customer_index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    customer_ids = customer_profiles["CustomerID"]
    return [(customer_ids.iloc[position], score) for position, score in candidates[:top_n]]

# Generate lookalikes for the first 20 customers.
# lookalike_map: CustomerID -> list of (LookalikeID, SimilarityScore) pairs.
lookalike_map = {}
for customer_id in customer_profiles["CustomerID"].iloc[:20]:
    lookalikes = get_top_lookalikes(customer_id, similarity_matrix, customer_profiles)
    lookalike_map[customer_id] = lookalikes

# Flatten the map into one (CustomerID, LookalikeID, SimilarityScore) row per pair.
lookalike_rows = [
    (cust_id, lookalike_id, score)
    for cust_id, lookalikes in lookalike_map.items()
    for lookalike_id, score in lookalikes
]

# Save to Lookalike.csv.
# Written through pandas instead of hand-built strings so that values are
# quoted/escaped when needed and the file is always UTF-8, which is what
# pd.read_csv assumes by default when the file is read back.
# float_format keeps the 4-decimal scores; na_rep keeps "nan" for missing scores.
pd.DataFrame(
    lookalike_rows,
    columns=["CustomerID", "LookalikeID", "SimilarityScore"],
).to_csv(
    "Lookalike.csv",
    index=False,
    float_format="%.4f",
    na_rep="nan",
    encoding="utf-8",
)

print("Lookalike.csv generated successfully!")

Lookalike.csv generated successfully!


In [4]:
# Load the saved Lookalike.csv and preview its first three rows
df = pd.read_csv("Lookalike.csv")
df.head(3)

Unnamed: 0,CustomerID,LookalikeID,SimilarityScore
0,C0001,C0184,0.9983
1,C0001,C0048,0.9953
2,C0001,C0190,0.9906
