In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Read the three input CSVs (customers, products, transactions) from the
# Classroom folder under /content/drive/MyDrive, in that order.
customers, products, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/Classroom/Customers.csv",
        "/content/drive/MyDrive/Classroom/Products.csv",
        "/content/drive/MyDrive/Classroom/Transactions.csv",
    )
)

In [3]:
# Build one wide, transaction-level table: first attach the matching customer
# row (joined on CustomerID), then the matching product row (joined on
# ProductID). Both joins use pandas' default inner join, so transactions
# without a matching customer or product are dropped.
transactions_with_customers = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(transactions_with_customers, products, on="ProductID")


 Feature Engineering

In [4]:
# Collapse the transaction-level table to one row per customer.
def _most_common(values):
    """Return the first entry of ``values.mode()``, i.e. the most frequent value."""
    return values.mode()[0]


customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),      # total spending
        Quantity=("Quantity", "sum"),          # total quantity purchased
        Category=("Category", _most_common),   # category on the most transaction rows
    )
    .reset_index()
)

In [5]:
# Attach the customer profile columns (the later encoding step needs Region)
# to the aggregated per-customer features.
#
# Only profile columns that are not already present are merged in. On a first
# run this gives the same result as merging the whole `customers` frame; on a
# re-run of this cell it is a no-op instead of producing suffixed duplicates
# (Region_x / Region_y) that would break the one-hot encoding step.
profile_columns = ["CustomerID"] + [
    col
    for col in customers.columns
    if col != "CustomerID" and col not in customer_features.columns
]
customer_features = customer_features.merge(customers[profile_columns], on="CustomerID")

In [6]:
# Turn the categorical columns into 0/1 indicator columns so they can be
# combined with the numeric features for the similarity computation.
categorical_columns = ["Region", "Category"]
encoder = OneHotEncoder()
encoder.fit(customer_features[categorical_columns])
# The encoder output is converted to a dense array before building the frame.
encoded_features = encoder.transform(customer_features[categorical_columns]).toarray()
encoded_feature_names = encoder.get_feature_names_out(categorical_columns)
encoded_df = pd.DataFrame(data=encoded_features, columns=encoded_feature_names)

In [7]:
# Standardise the numeric spend and quantity columns with StandardScaler so
# neither dominates the similarity score purely because of its scale.
numeric_columns = ["TotalValue", "Quantity"]
scaler = StandardScaler()
numerical_features = customer_features[numeric_columns]
normalized_numerical = scaler.fit_transform(numerical_features)
normalized_df = pd.DataFrame(
    data=normalized_numerical,
    columns=["TotalValue_scaled", "Quantity_scaled"],
)

In [8]:
# Place the one-hot block and the scaled numeric block side by side; rows are
# aligned on the frames' index.
feature_blocks = [encoded_df, normalized_df]
final_features = pd.concat(feature_blocks, axis="columns")

In [9]:
# Pairwise cosine similarity between every pair of customer feature rows;
# entry [i, j] compares row i with row j of `final_features`.
feature_matrix = final_features.to_numpy()
similarity_matrix = cosine_similarity(feature_matrix)

In [10]:
# Create lookalike recommendations: for each of the first N_TARGET_CUSTOMERS
# rows of `customer_features`, keep the TOP_K most similar *other* customers.
#
# The customer's own entry is excluded by index rather than by assuming it is
# sorted first: another customer with an identical feature vector (similarity
# 1.0) or floating-point rounding can put someone else ahead of the self-match,
# which would otherwise list the customer as their own lookalike and drop a
# real match.
N_TARGET_CUSTOMERS = 20
TOP_K = 3

lookalike_recommendations = {}
customer_ids = customer_features["CustomerID"].tolist()

for i, customer_id in enumerate(customer_ids[:N_TARGET_CUSTOMERS]):
    # (row index, similarity) for every customer except `customer_id` itself.
    candidate_scores = [
        (j, score) for j, score in enumerate(similarity_matrix[i]) if j != i
    ]
    # sorted() is stable, so equal scores keep their original row order.
    top_matches = sorted(candidate_scores, key=lambda pair: pair[1], reverse=True)[:TOP_K]
    # Cast to a plain Python float so later str()/repr() output is "0.9876"
    # rather than "np.float64(0.9876)" under NumPy >= 2.
    lookalike_recommendations[customer_id] = [
        (customer_ids[j], round(float(score), 4)) for j, score in top_matches
    ]

In [11]:
# Write one CSV row per target customer: the customer id plus the string form
# of that customer's list of (lookalike id, score) pairs.
lookalike_rows = [
    {"cust_id": target_id, "lookalikes": str(matches)}
    for target_id, matches in lookalike_recommendations.items()
]
lookalike_df = pd.DataFrame(lookalike_rows)
lookalike_df.to_csv("Vechalapu_Tejaswini_Lookalike.csv", index=False)