In [11]:
 import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

# Create sample data
def create_sample_data(n_customers=100, n_products=20, n_transactions=1000, seed=None):
    """Build synthetic customer, product and transaction tables.

    Args:
        n_customers: Number of customers to generate; IDs run ``C0001``,
            ``C0002``, ...
        n_products: Number of products to generate; IDs run ``P0001``,
            ``P0002``, ...
        n_transactions: Number of transactions to generate; IDs run
            ``T0000``, ``T0001``, ...
        seed: Optional seed for a private ``np.random.RandomState``. When
            ``None`` the global ``np.random`` state is used, so a prior
            ``np.random.seed(...)`` call still controls the output.

    Returns:
        tuple: ``(customers_df, products_df, transactions_df)`` where

        * ``customers_df`` has columns ``CustomerID`` and ``Region``,
        * ``products_df`` has columns ``ProductID`` and ``Category``,
        * ``transactions_df`` has columns ``TransactionID``, ``CustomerID``,
          ``ProductID`` and ``TotalValue`` (uniform in ``[10, 1000)``).
    """
    # The np.random module and a RandomState instance expose the same
    # ``choice`` / ``uniform`` API, so either can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Customers: sequential IDs with a randomly assigned region
    customer_ids = [f'C{i:04d}' for i in range(1, n_customers + 1)]
    customers_df = pd.DataFrame({
        'CustomerID': customer_ids,
        'Region': rng.choice(['North', 'South', 'East', 'West'], n_customers),
    })

    # Products: sequential IDs with a randomly assigned category
    product_ids = [f'P{i:04d}' for i in range(1, n_products + 1)]
    products_df = pd.DataFrame({
        'ProductID': product_ids,
        'Category': rng.choice(['Electronics', 'Clothing', 'Books', 'Home'], n_products),
    })

    # Transactions: each column is drawn in a single vectorised call rather
    # than one random draw per row.
    transactions_df = pd.DataFrame({
        'TransactionID': [f'T{i:04d}' for i in range(n_transactions)],
        'CustomerID': rng.choice(customer_ids, n_transactions),
        'ProductID': rng.choice(product_ids, n_transactions),
        'TotalValue': rng.uniform(10, 1000, n_transactions),
    })

    return customers_df, products_df, transactions_df

# Build the three synthetic tables (customers, products, transactions)
print("Creating sample data...")
sample_tables = create_sample_data()
customers_df, products_df, transactions_df = sample_tables
print("Data created successfully!")

# Report the (rows, columns) shape of every table, one per line
print(
    f"Customers shape: {customers_df.shape}",
    f"Products shape: {products_df.shape}",
    f"Transactions shape: {transactions_df.shape}",
    sep="\n",
)

# Per-customer spending summary: total spent, mean transaction value and
# number of transactions. Named aggregation yields flat column names
# directly, so no separate renaming step is needed.
print("\nCreating transaction summary...")
transactions_summary = (
    transactions_df
    .groupby("CustomerID")
    .agg(
        total_spent=("TotalValue", "sum"),
        avg_spent=("TotalValue", "mean"),
        num_transactions=("TransactionID", "count"),
    )
    .reset_index()
)

# Attach each transaction's product category, then total the spend per
# (customer, category) pair
print("Creating product preferences...")
product_transactions = transactions_df.merge(products_df, on="ProductID")
customer_preferences = (
    product_transactions
    .groupby(["CustomerID", "Category"])["TotalValue"]
    .sum()
    .reset_index()
)

# Wide layout: one row per customer, one column per category; categories a
# customer never bought from become 0
spend_by_category = customer_preferences.pivot(
    index="CustomerID",
    columns="Category",
    values="TotalValue",
).fillna(0)

# Convert absolute spend into each customer's share of spend per category
# (every row is divided by its own total)
row_totals = spend_by_category.sum(axis=1)
customer_preferences_pivot = spend_by_category.div(row_totals, axis=0)

# Left-join the spending summary onto the customer table so every customer
# is kept; missing values produced by the join are filled with 0
print("Merging customer data...")
customers_merged = customers_df.merge(
    transactions_summary,
    how="left",
    on="CustomerID",
)
customers_merged = customers_merged.fillna(0)

# Prepare features for similarity calculation
print("Preparing features...")

# Categorical features (Region), one-hot encoded as a dense array.
# scikit-learn renamed the `sparse` keyword to `sparse_output` (added in 1.2,
# old name removed in 1.4); try the new spelling first and fall back so the
# script runs on both old and new releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
region_encoded = encoder.fit_transform(customers_merged[["Region"]])
region_cols = [f"region_{cat}" for cat in encoder.categories_[0]]

# Scale numerical features to zero mean / unit variance so that no single
# spending metric dominates the similarity computation
scaler = StandardScaler()
numerical_features = ["total_spent", "avg_spent", "num_transactions"]
scaled_numerical = scaler.fit_transform(customers_merged[numerical_features])

# Align customer preferences with the customer data: reindexing puts the rows
# in the same order as customers_merged, and customers absent from the pivot
# get an all-zero preference row
final_customer_preferences = customer_preferences_pivot.reindex(customers_merged["CustomerID"]).fillna(0)

# Combine all features column-wise:
# region + scaled numerical features + customer preferences
final_features = np.hstack([
    region_encoded,
    scaled_numerical,
    final_customer_preferences.values
])

# Calculate the pairwise similarity matrix; entry [i, j] is the cosine
# similarity between the feature rows of customers i and j
print("Calculating similarity matrix...")
similarity_matrix = cosine_similarity(final_features)

# Function to get top similar customers
def get_top_similar(similarity_matrix, customer_index, customer_ids, top_n=3):
    """Return the ``top_n`` customers most similar to the one at ``customer_index``.

    Args:
        similarity_matrix: Square matrix where entry ``[i, j]`` is the
            similarity between customers ``i`` and ``j``. It is only read,
            never modified.
        customer_index: Row position of the query customer.
        customer_ids: Sequence mapping row positions to customer IDs.
        top_n: Maximum number of look-alikes to return.

    Returns:
        list: ``(customer_id, score)`` tuples ordered from most to least
        similar. The query customer itself is never included, so fewer than
        ``top_n`` entries are returned when there are not enough other
        customers. Equal scores are ordered by ascending row position.
    """
    # Indexing a 2-D array yields a view; it must not be written to, or the
    # caller's matrix would be silently altered.
    scores = np.asarray(similarity_matrix[customer_index])

    # Normalise a negative index so the self-exclusion test below matches.
    self_index = customer_index % len(scores)

    # Stable sort on the negated scores: descending similarity, ties broken
    # deterministically by position.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the customer itself explicitly instead of overwriting its score
    # with a sentinel (-1 is a legitimate cosine similarity value).
    ranked = ranked[ranked != self_index][:top_n]

    return [(customer_ids[i], scores[i]) for i in ranked]

# Build one result row per customer for (at most) the first 20 customers:
# the customer's own ID followed by alternating look-alike ID / score values
print("Generating recommendations...")
customer_ids = customers_merged["CustomerID"].tolist()
lookalike_results = []

for i, customer_id in enumerate(customer_ids[:20]):
    similar_customers = get_top_similar(similarity_matrix, i, customer_ids)
    row = [customer_id]
    # Flatten [(id, score), ...] into id, score, id, score, ...
    row.extend(value for pair in similar_customers for value in pair)
    lookalike_results.append(row)

# Assemble the rows into a table with one column per flattened value
columns = [
    "cust_id",
    "lookalike_1", "score_1",
    "lookalike_2", "score_2",
    "lookalike_3", "score_3",
]
lookalike_df = pd.DataFrame(data=lookalike_results, columns=columns)

# Persist the table to disk without the positional index column
print("Saving results...")
lookalike_df.to_csv("Lookalike.csv", index=False)
print("\nResults saved to Lookalike.csv")

# Preview the first rows of the look-alike table
print("\nLookalike Results (First 5 rows):")
print(lookalike_df.head())

# Headline numbers: row count and the mean over all three score columns
score_columns = ['score_1', 'score_2', 'score_3']
mean_score = lookalike_df[score_columns].mean().mean()
print("\nSummary Statistics:")
print(f"Total number of recommendations generated: {len(lookalike_df)}")
print(f"Average similarity score: {mean_score:.3f}")

# Insight 1: the three customers whose best look-alike scores highest
print("\nExample Insights:")
print("1. Top 3 most similar pairs:")
best_match_view = lookalike_df[['cust_id', 'lookalike_1', 'score_1']]
top_pairs = best_match_view.nlargest(3, 'score_1')
print(top_pairs)

# Insight 2: descriptive statistics over every score, pooled across the
# three rank columns
print("\n2. Distribution of similarity scores:")
all_scores = pd.concat([lookalike_df[name] for name in score_columns])
print(all_scores.describe())

print("\nProcess completed successfully!")


Creating sample data...
Data created successfully!
Customers shape: (100, 2)
Products shape: (20, 2)
Transactions shape: (1000, 4)

Creating transaction summary...
Creating product preferences...
Merging customer data...
Preparing features...
Calculating similarity matrix...
Generating recommendations...
Saving results...

Results saved to Lookalike.csv

Lookalike Results (First 5 rows):
  cust_id lookalike_1   score_1 lookalike_2   score_2 lookalike_3   score_3
0   C0001       C0044  0.985900       C0049  0.969533       C0047  0.890907
1   C0002       C0093  0.938874       C0037  0.936726       C0056  0.922008
2   C0003       C0076  0.971569       C0043  0.928285       C0035  0.883669
3   C0004       C0091  0.984061       C0086  0.944462       C0010  0.922448
4   C0005       C0084  0.928950       C0061  0.920700       C0022  0.902655

Summary Statistics:
Total number of recommendations generated: 20
Average similarity score: 0.925

Example Insights:
1. Top 3 most similar pairs:
   cus