In [12]:
import io

import numpy as np
import pandas as pd
from google.colab import files
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer


# Upload Customers.csv, Products.csv and Transactions.csv via the Colab picker.
uploaded = files.upload()


def _read_uploaded_csv(filename):
    """Load *filename* from this run's upload, else from the copy on disk.

    Colab does not overwrite an existing file: a repeated upload is saved as
    "name (1).csv", "name (2).csv", ... so reading the bare name from disk can
    silently return data from an earlier upload. The bytes returned by
    ``files.upload()`` are always the ones just selected.
    """
    payload = uploaded.get(filename)
    if payload is not None:
        return pd.read_csv(io.BytesIO(payload))
    return pd.read_csv(filename)


def _standardize(column):
    """Return the z-score of *column*; a zero/undefined spread yields zeros.

    Dividing by a standard deviation of 0 (constant column) or NaN (single
    row) would inject NaN/inf into the features, which cosine_similarity
    rejects.
    """
    centered = column - column.mean()
    spread = column.std()
    if pd.isna(spread) or spread == 0:
        return centered
    return centered / spread


customers_df = _read_uploaded_csv("Customers.csv")
products_df = _read_uploaded_csv("Products.csv")
transactions_df = _read_uploaded_csv("Transactions.csv")


# One row per transaction, enriched with customer and product attributes.
# Columns present on both sides of a merge get pandas' default _x/_y suffixes,
# so "Price_x" is the left-hand (transaction-side) price when both have one.
merged_df = (
    transactions_df.merge(customers_df, on="CustomerID", how="inner")
                   .merge(products_df, on="ProductID", how="inner")
)

print(merged_df.columns)


# Per-customer profile. "Price_x" is only aggregated when the column exists:
# naming a missing column in the agg spec raises KeyError, so it cannot be
# handled by swapping the aggregation function.
has_price = "Price_x" in merged_df.columns
aggregations = {
    "Region": "first",
    "SignupDate": "first",
    "Category": list,
}
if has_price:
    aggregations["Price_x"] = "mean"

customer_features = merged_df.groupby("CustomerID").agg(aggregations).reset_index()


# Multi-hot encode the product categories each customer has bought from.
mlb = MultiLabelBinarizer()
category_encoded = mlb.fit_transform(customer_features["Category"])

category_encoded_df = pd.DataFrame(category_encoded, columns=mlb.classes_)
customer_features = pd.concat([customer_features, category_encoded_df], axis=1)
customer_features = customer_features.drop(columns=["Category"])


# Account age in whole days, measured at the time the cell runs.
customer_features["SignupDate"] = pd.to_datetime(customer_features["SignupDate"])
customer_features["DaysSinceSignup"] = (pd.Timestamp.now() - customer_features["SignupDate"]).dt.days


# Keep only the numeric model inputs and discard incomplete rows.
final_features = customer_features.drop(columns=["CustomerID", "SignupDate", "Region"])
final_features = final_features.dropna()


if has_price:
    final_features["Price_x"] = _standardize(final_features["Price_x"])
final_features["DaysSinceSignup"] = _standardize(final_features["DaysSinceSignup"])


similarity_matrix = cosine_similarity(final_features)


# Row i of similarity_matrix corresponds to row i of final_features, so the
# id list must come from the rows that survived dropna(); taking it from the
# unfiltered frame would shift every id after a dropped row.
customer_ids = customer_features.loc[final_features.index, "CustomerID"].tolist()
lookalike_data = {}

def get_top_similar_customers(customer_id, similarity_matrix, customer_ids, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Args:
        customer_id: Id of the customer to find lookalikes for.
        similarity_matrix: Square matrix (NumPy array or nested sequence) in
            which entry ``[i][j]`` is the similarity between
            ``customer_ids[i]`` and ``customer_ids[j]``.
        customer_ids: List of ids, in the same order as the matrix rows.
        top_n: Maximum number of lookalikes to return (default 3).

    Returns:
        A list of ``(other_customer_id, score)`` tuples ordered by descending
        score, with scores rounded to 4 decimals as built-in floats. Ties keep
        the order of ``customer_ids``. Shorter than ``top_n`` when there are
        not enough other customers.

    Raises:
        ValueError: If ``customer_id`` is not in ``customer_ids`` or ``top_n``
            is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = customer_ids.index(customer_id)

    # Drop the customer's own entry by position rather than assuming it sorts
    # first: another customer with an identical feature vector ties with the
    # self-similarity and, with a stable sort, can come ahead of it.
    candidates = [
        (i, score)
        for i, score in enumerate(similarity_matrix[idx])
        if i != idx
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # float() keeps NumPy scalars out of the result so that str() of the list
    # renders plain numbers instead of "np.float64(...)".
    return [
        (customer_ids[i], round(float(score), 4))
        for i, score in candidates[:top_n]
    ]


# Collect the closest matches for the first 20 customers in the id list.
first_twenty = customer_ids[:20]
for customer_id in first_twenty:
    matches = get_top_similar_customers(customer_id, similarity_matrix, customer_ids)
    lookalike_data[customer_id] = matches


# One CSV row per customer; the match list is stored in its string form.
lookalike_list = []
for cust_id, cust_matches in lookalike_data.items():
    lookalike_list.append({"CustomerID": cust_id, "Lookalikes": str(cust_matches)})

lookalike_df = pd.DataFrame(lookalike_list)


# Write the table to disk, then hand it to the browser as a download.
output_name = "Lookalike.csv"
lookalike_df.to_csv(output_name, index=False)
files.download(output_name)

print("Lookalike.csv has been generated and is ready for download!")



Saving Customers.csv to Customers (7).csv
Saving Products.csv to Products (8).csv
Saving Transactions.csv to Transactions (7).csv
Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Lookalike.csv has been generated and is ready for download!
