In [28]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the datasets
customers_df = pd.read_csv("Customers.csv")
products_df = pd.read_csv("Products.csv")
transactions_df = pd.read_csv("Transactions.csv")

# Merge the datasets for analysis
merged_df = transactions_df.merge(customers_df, on='CustomerID', how='left')
merged_df = merged_df.merge(products_df, on='ProductID', how='left')

# Create a customer-product matrix (each customer and their purchased products)
customer_product_matrix = merged_df.pivot_table(
    index='CustomerID', columns='ProductID', values='Quantity', fill_value=0
)

# Create a customer-category matrix (each customer and the product categories they purchased)
customer_category_matrix = merged_df.groupby(['CustomerID', 'Category'])['Quantity'].sum().unstack(fill_value=0)

# Combine the customer-product and customer-category matrices
customer_features = pd.concat([customer_product_matrix, customer_category_matrix], axis=1)

# Convert customer features to string format for TF-IDF Vectorizer
# Apply lambda to combine all customer features into one string per customer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(customer_features.apply(lambda x: ' '.join(x.astype(str)), axis=1))

# Calculate cosine similarity between customers
cosine_sim = cosine_similarity(tfidf_matrix)

# Function to find top n lookalike customers for a given customer based on cosine similarity
def find_lookalikes(customer_id, cosine_sim, n_lookalikes=3):
    """
    Finds the top n_lookalikes for a given customer based on cosine similarity.

    Args:
        customer_id: The ID of the customer to find lookalikes for.
        cosine_sim: The cosine similarity matrix.
        n_lookalikes: The number of lookalikes to find.

    Returns:
        A list of tuples, each containing the lookalike customer ID and their similarity score.
    """
    idx = customers_df[customers_df['CustomerID'] == customer_id].index[0]  # Find the index of the customer
    sim_scores = list(enumerate(cosine_sim[idx]))  # Get the similarity scores for the customer
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)  # Sort by similarity score in descending order
    lookalikes = sim_scores[1:n_lookalikes + 1]  # Exclude the customer itself (index 0)
    return lookalikes

# Find lookalikes for the first 20 customers (C0001 - C0020)
lookalike_customers = {}
for customer_id in customers_df['CustomerID'][:20]:  # Iterate over the first 20 customers
    lookalikes = find_lookalikes(customer_id, cosine_sim)  # Find the lookalikes for each customer
    lookalike_customers[customer_id] = [
        {'CustomerID': customers_df.iloc[lookalike[0]]['CustomerID'], 'Similarity': lookalike[1]}
        for lookalike in lookalikes
    ]

# Create a DataFrame to store the results
lookalike_df = pd.DataFrame.from_dict(lookalike_customers, orient='index')
lookalike_df.index.name = 'CustomerID'
lookalike_df.columns = ['Lookalike_' + str(i) for i in range(1, 4)]

# Save the results to a CSV file
lookalike_df.to_csv("Lookalike.csv")

# Print the results for the first 5 customers
print(lookalike_df.head())


                                           Lookalike_1  \
CustomerID                                               
C0001       {'CustomerID': 'C0002', 'Similarity': 0.0}   
C0002       {'CustomerID': 'C0002', 'Similarity': 0.0}   
C0003       {'CustomerID': 'C0002', 'Similarity': 0.0}   
C0004       {'CustomerID': 'C0002', 'Similarity': 0.0}   
C0005       {'CustomerID': 'C0002', 'Similarity': 0.0}   

                                           Lookalike_2  \
CustomerID                                               
C0001       {'CustomerID': 'C0003', 'Similarity': 0.0}   
C0002       {'CustomerID': 'C0003', 'Similarity': 0.0}   
C0003       {'CustomerID': 'C0003', 'Similarity': 0.0}   
C0004       {'CustomerID': 'C0003', 'Similarity': 0.0}   
C0005       {'CustomerID': 'C0003', 'Similarity': 0.0}   

                                           Lookalike_3  
CustomerID                                              
C0001       {'CustomerID': 'C0004', 'Similarity': 0.0}  
C0002       {'C