In [12]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from datetime import datetime

In [13]:
# Read the three input tables (customers, products, transactions) from CSV
# files located in the current working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)


In [14]:
def _extract_brand(product_name):
    """Return the first whitespace-separated token of a product name.

    Falls back to "Unknown" when the value is not a string (e.g. NaN) or is
    an empty / whitespace-only string, for which ``split()`` yields no tokens
    and indexing ``[0]`` would raise IndexError.
    """
    if isinstance(product_name, str):
        tokens = product_name.split()
        if tokens:
            return tokens[0]
    return "Unknown"


# Derive a Brand column from the leading word of ProductName.
products["Brand"] = products["ProductName"].apply(_extract_brand)

# Attach product attributes to every transaction. The left join keeps
# transactions whose ProductID has no match in `products`; their product
# columns (Category, Brand, ...) are NaN.
transactions_products = transactions.merge(products, on="ProductID", how="left")

# When both inputs carry a `Price` column, the merge suffixes them `_x`
# (transactions) and `_y` (products); give them explicit names.
# NOTE(review): rename silently skips missing labels, so if only one input has
# `Price` no `ProductPrice` column is created here - confirm against the CSVs.
transactions_products = transactions_products.rename(
    columns={"Price_x": "TransactionPrice", "Price_y": "ProductPrice"}
)

In [15]:
def _most_frequent(values, default="Unknown"):
    """Return the most frequent non-null value of a Series.

    ``Series.mode()`` ignores NaN, so a group whose values are all missing
    (possible after the left merge with products) yields an empty result and
    ``mode()[0]`` would raise. In that case ``default`` is returned instead.
    When several values tie, the first one returned by ``mode()`` is used.
    """
    modes = values.mode()
    if modes.empty:
        return default
    return modes.iloc[0]


# Collapse the transaction-level table to one row per customer.
customer_data = transactions_products.groupby("CustomerID").agg({
    "TotalValue": "sum",          # Total revenue per customer
    "TransactionID": "count",     # Transaction frequency
    "Category": _most_frequent,   # Most purchased category
    "ProductPrice": "mean",       # Average price of purchased products
    "Brand": _most_frequent,      # Most frequently purchased brand
}).reset_index()


In [16]:

# Bring in the columns of `customers` for each aggregated row. A left join on
# CustomerID keeps every customer that has transactions, whether or not a
# matching row exists in `customers`.
customer_data = pd.merge(customer_data, customers, how="left", on="CustomerID")

In [17]:


# Calculate average transactions per year of tenure.
# NOTE: tenure is measured against the wall-clock time of this run, so the
# resulting feature values change depending on the day the notebook executes.
current_date = datetime.now()
customer_data["SignupDate"] = pd.to_datetime(customer_data["SignupDate"])

# Floor the tenure at one day: a signup dated today (0 days) would make the
# division below produce inf, and a future-dated signup would produce a
# negative rate. Missing signup dates stay NaN.
tenure_days = (current_date - customer_data["SignupDate"]).dt.days.clip(lower=1)
customer_data["YearsAsCustomer"] = tenure_days / 365.25

# `TransactionID` holds the per-customer transaction count at this point.
customer_data["AvgTransactionsPerYear"] = customer_data["TransactionID"] / customer_data["YearsAsCustomer"]

In [18]:
# One-hot encode the categorical columns Region, Category and Brand.
# A single encoder instance is re-fitted for each column in turn, so after
# this cell `encoder` holds the fit for the last column (Brand) only.
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

encoded_frames = {}
for column in ("Region", "Category", "Brand"):
    one_hot = encoder.fit_transform(customer_data[[column]])
    # Column labels are read right after fitting, while the encoder still
    # reflects this column's categories.
    encoded_frames[column] = pd.DataFrame(
        one_hot,
        columns=encoder.get_feature_names_out([column]),
    )

encoded_region = encoded_frames["Region"]
encoded_category = encoded_frames["Category"]
encoded_brand = encoded_frames["Brand"]

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [19]:
# Build the feature table: the numeric per-customer columns followed by the
# one-hot blocks for region, category and brand, placed side by side.
numeric_columns = ["TotalValue", "TransactionID", "ProductPrice", "AvgTransactionsPerYear"]
feature_blocks = [
    customer_data[numeric_columns],
    encoded_region,
    encoded_category,
    encoded_brand,
]
features = pd.concat(feature_blocks, axis=1)


In [21]:
# Rescale every feature column with a MinMaxScaler (default settings) so that
# no single column dominates the similarity computation.
scaler = MinMaxScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)

# Pairwise cosine similarity between customers, labelled on both axes by
# CustomerID so a customer's scores can be looked up by id.
customer_ids = customer_data["CustomerID"]
similarity_matrix = cosine_similarity(features_scaled)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_ids, columns=customer_ids)

# Generate the top 3 lookalikes for the first 20 customers in `customer_data`.
# NOTE(review): these are C0001-C0020 only if every one of those customers has
# at least one transaction (customer_data is built from the transactions
# table) - confirm against the data.
top_lookalikes = {}
for customer_id in customer_data["CustomerID"][:20]:
    scores = similarity_df[customer_id]
    # Remove the customer's own entry explicitly. Relying on it being the
    # first element after sorting breaks when another customer ties at the
    # same score (or floating-point error puts self-similarity slightly below
    # another value), which would list the customer as their own lookalike.
    candidates = scores[scores.index != customer_id]
    best_matches = candidates.nlargest(3)
    # Convert to plain Python floats so the list serialises as bare numbers
    # rather than NumPy scalar reprs when written to CSV.
    top_lookalikes[customer_id] = [
        (sim_id, round(float(sim_score), 4))
        for sim_id, sim_score in best_matches.items()
    ]

# Save the lookalike table: one row per customer, with the list of
# (lookalike id, similarity score) pairs stored in a single column.
output_path = "Lookalike1.csv"
lookalike_df = pd.DataFrame([
    {"cust_id": cust_id, "lookalikes": lookalikes}
    for cust_id, lookalikes in top_lookalikes.items()
])
lookalike_df.to_csv(output_path, index=False)

# Report the path that was actually written; the previous message named
# "Lookalike.csv" although the file is saved as "Lookalike1.csv".
print(f"{output_path} has been successfully created!")


Lookalike.csv has been successfully created!


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
