In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load customer and product data
customer_data = pd.read_csv('Customers.csv')
products_data = pd.read_csv('Products.csv')

# Strip spaces and convert to uppercase for consistency
customer_data['CustomerID'] = customer_data['CustomerID'].str.strip().str.upper()
products_data['ProductID'] = products_data['ProductID'].str.strip().str.upper()

# Check for missing values in customer and product data
print(f"Missing values in customer data: {customer_data.isnull().sum()}")
print(f"Missing values in product data: {products_data.isnull().sum()}")

# One-hot encode the 'Region' column for customers
region_dummies = pd.get_dummies(customer_data['Region'], drop_first=True)

# One-hot encode the 'Category' column in products data
product_categories = pd.get_dummies(products_data['Category'], drop_first=True)

# Create a dummy customer-product interaction matrix based on product category preference
# Assume each customer likes a product based on the region or some category-based heuristic
# We'll create a product-category preference matrix for each customer

# Create a feature matrix for customers, assuming they like products based on category (using random preferences for now)
customer_preferences = pd.DataFrame(np.random.randint(0, 2, size=(len(customer_data), len(product_categories.columns))),
                                    columns=product_categories.columns)

# Normalize price (using maximum price as a factor for simplicity)
price_normalized = products_data['Price'] / products_data['Price'].max()

# Combine region dummies and customer preferences for customers
customer_features = pd.concat([region_dummies, customer_preferences], axis=1)

# Optionally, include normalized price info as a customer feature (can adjust based on a real heuristic)
customer_features['Price'] = np.random.choice(price_normalized, size=len(customer_data))

# Compute cosine similarity matrix between customers
cosine_sim = cosine_similarity(customer_features)

# Create a DataFrame for cosine similarity
cosine_sim_df = pd.DataFrame(cosine_sim, index=customer_data['CustomerID'], columns=customer_data['CustomerID'])

# Print cosine similarity matrix for reference
print(cosine_sim_df)

# Assign a similarity score to each recommended customer
top_n = 3  # We need top 3 lookalikes
recommendations = {}

for customer_id in customer_data['CustomerID']:
    # Get the similarity scores for each customer with the given customer_id
    sim_scores = cosine_sim_df[customer_id].sort_values(ascending=False).iloc[1:top_n+1]  # Excluding self
    
    # Store the recommended customers and their similarity scores
    recommendations[customer_id] = [(cust_id, sim_scores[cust_id]) for cust_id in sim_scores.index]

# Save the recommendations in the required format (Map<cust_id, List<cust_id, score>>)
recommendations_data = []
for customer_id, similar_customers in recommendations.items():
    for similar_customer, score in similar_customers:
        recommendations_data.append({'CustomerID': customer_id, 'LookalikeCustomerID': similar_customer, 'SimilarityScore': score})

# Create a DataFrame for Lookalike recommendations
lookalike_df = pd.DataFrame(recommendations_data)

# Save to Lookalike.csv
lookalike_df.to_csv('Lookalike.csv', index=False)

# Print first few lookalikes as confirmation
print(lookalike_df.head())


Missing values in customer data: CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
Missing values in product data: ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.721404  0.562896  0.736562  0.560983  0.733795   
C0002       0.721404  1.000000  0.088603  0.103962  0.651386  0.522646   
C0003       0.562896  0.088603  1.000000  0.769227  0.289974  0.770401   
C0004       0.736562  0.103962  0.769227  1.000000  0.340239  0.636243   
C0005       0.560983  0.651386  0.289974  0.340239  1.000000  0.802361   
...              ...       ...       ...       ...       ...       ...   
C0196       0.775778  0.720590  0.145660  0.457557  0.574718  0.461131   
C0197       0.770812  0.721675  0.119266  0.432527  0.541318  0.434332   
C0198       0.630434  0.44864