In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the three raw datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge datasets for a complete customer profile.
# Both joins are inner joins, so transactions without a matching customer or
# product row are dropped, and customers with no transactions never appear.
merged_data = transactions.merge(customers, on='CustomerID', how='inner').merge(products, on='ProductID', how='inner')

# Preprocess data for similarity calculation
# 1. Aggregate transaction data at the customer level (one row per customer)
customer_summary = merged_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',       # Total transaction value
    'Quantity': 'sum',         # Total products purchased
    'Region': 'first',         # Region of the customer
}).reset_index()

# 2. Derive the average value per purchased unit.
# A zero total quantity would otherwise yield inf/NaN, which the scaling and
# cosine-similarity steps below cannot accept, so those rows fall back to 0.0.
customer_summary['AvgPrice'] = (
    customer_summary['TotalValue'] / customer_summary['Quantity']
).where(customer_summary['Quantity'] != 0, 0.0)

# 3. Encode the 'Region' column using one-hot encoding
# (drop_first=True drops one category to avoid a redundant indicator column)
customer_summary = pd.get_dummies(customer_summary, columns=['Region'], drop_first=True)

# 4. Standardize every feature column (everything except the identifier)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_summary.drop(['CustomerID'], axis=1))

# Compute cosine similarity between customers; row/column i of the matrix
# corresponds to row i of customer_summary
similarity_matrix = cosine_similarity(scaled_features)

# Create a function to find the top 3 similar customers
def get_top_3_similar(customers, similarity_matrix, customer_index, *, top_n=3):
    """Return the customers most similar to the one at ``customer_index``.

    Args:
        customers: DataFrame with a 'CustomerID' column whose row order
            matches the rows/columns of ``similarity_matrix``.
        similarity_matrix: Square, row-indexable collection of pairwise
            similarity scores.
        customer_index: Positional index of the target customer. Negative
            values count from the end, as with ordinary sequence indexing.
        top_n: Maximum number of lookalikes to return (defaults to 3).

    Returns:
        A list of ``(customer_id, score)`` tuples ordered from most to least
        similar. The target customer itself is never included. Ties keep
        their original positional order.
    """
    # Get similarity scores for the given customer
    similarity_scores = similarity_matrix[customer_index]

    # Map a negative index onto its positional equivalent; otherwise the
    # self-exclusion filter below would never match and the customer would
    # be reported as its own best lookalike.
    if customer_index < 0:
        customer_index += len(similarity_scores)

    # Every other customer, paired with its score against the target
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores)
        if position != customer_index
    ]
    # Stable sort: equal scores stay in their original positional order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Translate positions back into customer IDs
    customer_ids = customers['CustomerID']
    return [
        (customer_ids.iloc[position], score)
        for position, score in candidates[:max(top_n, 0)]
    ]

# Generate the top 3 lookalikes for the first 20 customers in customer_summary.
# groupby sorts its keys by default, so these are the 20 lowest CustomerIDs
# that have at least one transaction (C0001 - C0020 when none are missing).
lookalike_results = {}
# Cap at the number of available rows so a small dataset does not raise IndexError
num_targets = min(20, len(customer_summary))
for idx in range(num_targets):
    customer_id = customer_summary['CustomerID'].iloc[idx]
    similar_customers = get_top_3_similar(customer_summary, similarity_matrix, idx)
    lookalike_results[customer_id] = similar_customers

# Convert results into a DataFrame: one row per target customer, with the
# lookalikes stored as a list of {'CustomerID', 'Score'} dicts.
# Scores are cast to built-in float after rounding so the list serialises as
# plain numbers (e.g. 0.99) rather than NumPy scalar reprs in the CSV.
# Explicit columns keep the frame well-formed even if there are no results.
lookalike_df = pd.DataFrame(
    [
        {
            'CustomerID': cust_id,
            'Lookalikes': [
                {'CustomerID': match_id, 'Score': float(round(score, 2))}
                for match_id, score in lookalikes
            ],
        }
        for cust_id, lookalikes in lookalike_results.items()
    ],
    columns=['CustomerID', 'Lookalikes'],
)

# Save the lookalikes to a CSV file
lookalike_df.to_csv('Shrasti_Pandey_Lookalike.csv', index=False)

# Output sample lookalike data
print(lookalike_df.head(10))


  CustomerID                                         Lookalikes
0      C0001  [{'CustomerID': 'C0191', 'Score': 0.99}, {'Cus...
1      C0002  [{'CustomerID': 'C0142', 'Score': 0.99}, {'Cus...
2      C0003  [{'CustomerID': 'C0147', 'Score': 0.99}, {'Cus...
3      C0004  [{'CustomerID': 'C0113', 'Score': 0.99}, {'Cus...
4      C0005  [{'CustomerID': 'C0177', 'Score': 0.98}, {'Cus...
5      C0006  [{'CustomerID': 'C0126', 'Score': 0.99}, {'Cus...
6      C0007  [{'CustomerID': 'C0146', 'Score': 1.0}, {'Cust...
7      C0008  [{'CustomerID': 'C0018', 'Score': 0.94}, {'Cus...
8      C0009  [{'CustomerID': 'C0198', 'Score': 1.0}, {'Cust...
9      C0010  [{'CustomerID': 'C0121', 'Score': 0.97}, {'Cus...
