In [40]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Load the data
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

## Feature Engineering: Aggregate transaction-level data for customer profiles

In [41]:
# Merge transaction data with customer and product information
data = transactions.merge(customers, on="CustomerID", how="left")
data = data.merge(products, on="ProductID", how="left")

# Feature Engineering: Aggregate transaction-level data for customer profiles
customer_features = data.groupby('CustomerID').agg({
    'Region': lambda x: x.mode()[0],  # Region as the most common value
    'Quantity': 'sum',               # Total quantity purchased
    'TotalValue': 'sum',             # Total revenue generated
    'Category': lambda x: x.value_counts().index[0]  # Most purchased category
}).reset_index()

# One-Hot Encode categorical features (Region and Category)
customer_features = pd.get_dummies(customer_features, columns=['Region', 'Category'], drop_first=True)

# Normalize numerical features for similarity calculations
scaler = MinMaxScaler()
customer_features.iloc[:, 1:] = scaler.fit_transform(customer_features.iloc[:, 1:])

# Build a similarity matrix using cosine similarity
similarity_matrix = cosine_similarity(customer_features.iloc[:, 1:])
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])


In [43]:
# Generate lookalikes for the first 20 customers
lookalike_map = {}

for cust_id in customer_features['CustomerID'][:20]:
    # Exclude the current customer and get the top 3 most similar customers
    similar_customers = similarity_df[cust_id].sort_values(ascending=False).drop(cust_id).head(3)
    lookalike_map[cust_id] = list(similar_customers.items())

# Convert to the required format for Lookalike.csv
lookalike_results = []

for cust_id, recommendations in lookalike_map.items():
    for rec_id, score in recommendations:
        lookalike_results.append({
            "cust_id": cust_id,
            "recommended_cust_id": rec_id,
            "similarity_score": score
        })

lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("SampleLookalike.csv", index=False)

print("Lookalike Model Completed. Results saved to SampleLookalike.csv")


Lookalike Model Completed. Results saved to SampleLookalike.csv


## Apply KMeans clustering to customer features

In [44]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Apply KMeans clustering to customer features
kmeans = KMeans(n_clusters=5, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(customer_features.iloc[:, 1:])

# Calculate Silhouette Score
silhouette_avg = silhouette_score(customer_features.iloc[:, 1:], customer_features['Cluster'])
print(f"Silhouette Score for Clusters: {silhouette_avg}")

Silhouette Score for Clusters: 0.48869615016567886




In [45]:
for cust_id, recommendations in lookalike_map.items():
    actual_distances = []
    for rec_id, score in recommendations:
        # Get features for current and recommended customers
        cust_features = customer_features[customer_features['CustomerID'] == cust_id].iloc[:, 1:].values
        rec_features = customer_features[customer_features['CustomerID'] == rec_id].iloc[:, 1:].values

        # Compute actual cosine similarity
        actual_distance = cosine_similarity(cust_features, rec_features)[0][0]
        actual_distances.append(actual_distance)

    # Display results for consistency
    print(f"Customer {cust_id} Recommendations: {recommendations}")
    print(f"Actual Similarities: {actual_distances}")

Customer C0001 Recommendations: [('C0107', 0.9998616430777432), ('C0184', 0.9997715277163312), ('C0048', 0.9995332182193393)]
Actual Similarities: [0.9999030452138098, 0.9998405673491204, 0.9996722498510545]
Customer C0002 Recommendations: [('C0159', 0.9979999334827133), ('C0178', 0.9967115387634209), ('C0110', 0.9798616414064173)]
Actual Similarities: [0.9995567685983444, 0.9992763915439784, 0.9956010888554828]
Customer C0003 Recommendations: [('C0181', 0.9990998973903307), ('C0133', 0.9979316325806195), ('C0076', 0.9977575893729745)]
Actual Similarities: [0.999356198124478, 0.9985542069281221, 0.9984157165167075]
Customer C0004 Recommendations: [('C0169', 0.9971583414867403), ('C0165', 0.9914252776551769), ('C0153', 0.9913886553298505)]
Actual Similarities: [0.9978038040040303, 0.9924557362020743, 0.9937287170628568]
Customer C0005 Recommendations: [('C0186', 0.9996981348234925), ('C0146', 0.9984708703936827), ('C0007', 0.9984341247972212)]
Actual Similarities: [0.9996981348234925, 0

## Average Similarity Score for Top 3 Recommendations

In [46]:
mean_similarity_scores = []

for cust_id, recommendations in lookalike_map.items():
    scores = [score for _, score in recommendations]
    mean_similarity_scores.append(sum(scores) / len(scores))

average_similarity = sum(mean_similarity_scores) / len(mean_similarity_scores)
print(f"Average Similarity Score for Top-3 Recommendations: {average_similarity}")

Average Similarity Score for Top-3 Recommendations: 0.9937983398056056


In [47]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score

# Load the data
customers = pd.read_csv("Customers.csv")
products = pd.read_csv("Products.csv")
transactions = pd.read_csv("Transactions.csv")

# Merge transaction data with customer and product information
data = transactions.merge(customers, on="CustomerID", how="left")
data = data.merge(products, on="ProductID", how="left")

# Ensure TransactionDate is in datetime format
data['TransactionDate'] = pd.to_datetime(data['TransactionDate'])


In [48]:
# Feature Engineering
customer_features = data.groupby('CustomerID').agg({
    'Region': lambda x: x.mode()[0],                     # Most common region
    'Quantity': 'sum',                                  # Total quantity purchased
    'TotalValue': 'sum',                                # Total revenue generated
    'Category': lambda x: x.nunique(),                  # Number of unique categories purchased
    'ProductID': lambda x: x.nunique(),                 # Number of unique products purchased
    'TransactionDate': lambda x: (data['TransactionDate'].max() - x.max()).days,  # Recency
}).reset_index()

# Rename columns for clarity
customer_features.rename(columns={
    'Category': 'CategoryDiversity',
    'ProductID': 'ProductDiversity',
    'TransactionDate': 'Recency'
}, inplace=True)

# One-hot encode the Region column
customer_features = pd.get_dummies(customer_features, columns=['Region'], drop_first=True)

# Normalize numerical features
scaler = MinMaxScaler()
customer_features.iloc[:, 1:] = scaler.fit_transform(customer_features.iloc[:, 1:])


## Silhouette Score

In [49]:
# Apply K-Means Clustering
optimal_k = 5  # Determined using domain knowledge or the elbow method
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
customer_features['Cluster'] = kmeans.fit_predict(customer_features.iloc[:, 1:])

# Calculate Silhouette Score
silhouette_avg = silhouette_score(customer_features.iloc[:, 1:], customer_features['Cluster'])
print(f"Silhouette Score: {silhouette_avg:.4f}")

# Build Similarity Matrix
similarity_matrix = cosine_similarity(customer_features.iloc[:, 1:-1])  # Exclude the Cluster column
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])


Silhouette Score: 0.6405




## Lookalike Model for First 20 Customers

## Average Similarity Score for Top-3 Recommendations 

In [50]:
import pandas as pd

# Generate Lookalikes
lookalike_map = {}
for cust_id in customer_features['CustomerID']:
    # Get the top 3 most similar customers, excluding the customer themselves
    similar_customers = similarity_df[cust_id].sort_values(ascending=False).drop(cust_id).head(3)
    lookalike_map[cust_id] = [(rec_id, round(score, 4)) for rec_id, score in similar_customers.items()]

# Limit to the first 20 customers
top_20_customers = list(customer_features['CustomerID'])[:20]
lookalike_results = []
for cust_id in top_20_customers:
    recommendations = lookalike_map.get(cust_id, [])
    lookalike_results.append({
        "cust_id": cust_id + "  :  ",
        "Recommendations": recommendations  # Store list of tuples directly
    })

# Create a DataFrame and save results to CSV
lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv("Vaishnavi_Anugu_Lookalike.csv", index=False, sep='\t')
print("Lookalike Model for Top 20 Customers Completed. Results saved to Vaishnavi_Anugu_Lookalike.csv")

# Analyze Mean Similarity Scores
mean_similarity_scores = []
for cust_id in top_20_customers:
    recommendations = lookalike_map.get(cust_id, [])
    scores = [score for _, score in recommendations]
    if scores:  # Avoid empty lists
        mean_similarity_scores.append(sum(scores) / len(scores))

average_similarity = sum(mean_similarity_scores) / len(mean_similarity_scores)
print(f"Average Similarity Score for Top-3 Recommendations (Top 20 Customers): {average_similarity:.4f}")


Lookalike Model for Top 20 Customers Completed. Results saved to Vaishnavi_Anugu_Lookalike.csv
Average Similarity Score for Top-3 Recommendations (Top 20 Customers): 0.9838
