In [1]:
# Import libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler



In [2]:
# Load datasets
customers = pd.read_csv("/Users/apple/Desktop/Internship/Customers.csv")
products = pd.read_csv("/Users/apple/Desktop/Internship/Products.csv")
transactions = pd.read_csv("/Users/apple/Desktop/Internship/Transactions.csv")

In [3]:
# Merge transactions with products and customers
transactions_products = transactions.merge(products, on="ProductID")
merged_data = transactions_products.merge(customers, on="CustomerID")

In [4]:
# Pivot table to create customer-product preferences
customer_product_features = merged_data.pivot_table(
    index="CustomerID",
    columns="ProductID",
    values="Quantity",
    aggfunc="sum",
    fill_value=0
)

In [5]:
# Normalize customer demographic data (e.g., Region)
customer_profile = pd.get_dummies(customers.set_index("CustomerID"), columns=["Region"])


In [6]:
# Combine customer-product preferences with their profile
final_features = customer_product_features.merge(customer_profile, on="CustomerID")

In [7]:
# Ensure only numeric columns are included
numeric_features = final_features.select_dtypes(include=["number"])


In [8]:
# Scale the data
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)


In [9]:
# Compute cosine similarity
similarity_matrix = cosine_similarity(scaled_features)

In [10]:
# Convert similarity matrix to a DataFrame
similarity_df = pd.DataFrame(
    similarity_matrix,
    index=final_features.index,
    columns=final_features.index
)

In [11]:
# Function to generate top-N lookalikes for specific customers
def get_top_lookalikes(similarity_df, target_customers, top_n=3):
    lookalike_map = {}
    for customer_id in target_customers:
        if customer_id in similarity_df.index:
            # Sort by similarity scores (descending) and exclude the customer itself
            top_similar = similarity_df.loc[customer_id].sort_values(ascending=False).iloc[1:top_n + 1]
            lookalike_map[customer_id] = [
                {"SimilarCustomerID": sim_id, "Score": round(score, 3)}
                for sim_id, score in top_similar.items()
            ]
        else:
            # Handle missing customer IDs
            lookalike_map[customer_id] = []
    return lookalike_map

In [12]:
# Specify the first 20 customers (C0001 - C0020)
target_customers = [f"C{str(i).zfill(4)}" for i in range(1, 21)]

In [13]:
# Generate the lookalike map for the specific customers
lookalike_map = get_top_lookalikes(similarity_df, target_customers)

In [14]:
# Convert the lookalike map to the required Map format
lookalike_data = [
    {"CustomerID": customer_id, "Lookalikes": lookalikes}
    for customer_id, lookalikes in lookalike_map.items()
]

In [15]:
# Save the results to a CSV file
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv("TonyDylin_Marneni_Lookalike.csv", index=False)

In [16]:
# Display the lookalike DataFrame
print(lookalike_df.head())

  CustomerID                                         Lookalikes
0      C0001  [{'SimilarCustomerID': 'C0194', 'Score': 0.405...
1      C0002  [{'SimilarCustomerID': 'C0030', 'Score': 0.405...
2      C0003  [{'SimilarCustomerID': 'C0181', 'Score': 0.478...
3      C0004  [{'SimilarCustomerID': 'C0070', 'Score': 0.352...
4      C0005  [{'SimilarCustomerID': 'C0096', 'Score': 0.487...
