# In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity


# Load the three raw tables from CSV files in the working directory.
customers_df, transactions_df, products_df = (
    pd.read_csv(csv_path)
    for csv_path in ('Customers.csv', 'Transactions.csv', 'Products.csv')
)

# Join every transaction with the columns of the product it refers to.
trans_prod = transactions_df.merge(products_df, on='ProductID')

def _most_frequent(values):
    """Return the first mode of *values*, or None when there is no mode.

    ``Series.mode()`` ignores missing values, so a group whose entries are
    all missing yields an empty result; guard on that result rather than on
    the group itself.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


def create_customer_features(customers_df, trans_prod, reference_date=None):
    """Build one feature row per customer from profile and transaction data.

    Args:
        customers_df: Customer table. Must contain ``CustomerID`` and
            ``SignupDate``; all of its columns are carried into the result.
        trans_prod: Transaction rows (already joined with product data).
            Must contain ``CustomerID``, ``TotalValue``, ``Quantity``,
            ``Category`` and ``TransactionID``.
        reference_date: Date from which ``signup_days`` is measured. Anything
            accepted by ``pd.Timestamp``; defaults to the current time.

    Returns:
        A DataFrame with the columns of ``customers_df`` plus
        ``total_value``, ``avg_value``, ``total_quantity``, ``avg_quantity``,
        ``favorite_category``, ``transaction_count`` and ``signup_days``.
        The merge is an inner join, so customers without any transaction
        are not present in the result.
    """
    # Named aggregation ties each output column to its source explicitly
    # instead of relying on the positional order of a flattened MultiIndex.
    transaction_features = trans_prod.groupby('CustomerID').agg(
        total_value=('TotalValue', 'sum'),
        avg_value=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
        favorite_category=('Category', _most_frequent),
        transaction_count=('TransactionID', 'count'),
    ).reset_index()

    # pd.merge returns a new frame, so the caller's customers_df is untouched.
    final_features = pd.merge(customers_df, transaction_features, on='CustomerID')

    if reference_date is None:
        reference = pd.Timestamp.now()
    else:
        reference = pd.Timestamp(reference_date)
    signup_dates = pd.to_datetime(final_features['SignupDate'])
    final_features['signup_days'] = (reference - signup_dates).dt.days

    return final_features

# Per-customer feature table: profile columns plus transaction aggregates.
feature_df = create_customer_features(customers_df, trans_prod)

# Column groups: categorical columns are one-hot encoded, numeric columns
# are standardized.
categorical_features = ['Region', 'favorite_category']
numeric_features = [
    'signup_days', 'total_value', 'avg_value',
    'total_quantity', 'avg_quantity', 'transaction_count',
]

# One-hot encode the categorical columns; every other column is kept as is.
feature_matrix = pd.get_dummies(feature_df, columns=categorical_features)

# Standardize the numeric columns in place.
scaler = StandardScaler()
scaled_numeric = scaler.fit_transform(feature_matrix[numeric_features])
feature_matrix[numeric_features] = scaled_numeric

# Identifier and raw-date columns are not features, so they are excluded
# before computing pairwise cosine similarity between customer rows.
# Row/column i of similarity_matrix corresponds to row i of feature_df.
non_feature_columns = ['CustomerID', 'CustomerName', 'SignupDate']
similarity_matrix = cosine_similarity(feature_matrix.drop(columns=non_feature_columns))

def get_top_3_similar(customer_id, feature_df, similarity_matrix):
    """Return the three customers most similar to *customer_id*.

    Args:
        customer_id: Value to look up in ``feature_df['CustomerID']``.
        feature_df: Feature table whose row order matches the rows and
            columns of ``similarity_matrix``.
        similarity_matrix: Square array of pairwise similarity scores.

    Returns:
        A list of up to three dicts with keys ``customer_id`` and
        ``similarity_score`` (a Python float rounded to 3 decimals),
        ordered from most to least similar. The customer itself is never
        included.

    Raises:
        IndexError: If ``customer_id`` is not present in ``feature_df``.
    """
    # Locate the customer by row POSITION: the similarity matrix is indexed
    # positionally, while the frame's index labels need not be 0..n-1.
    matches = np.flatnonzero((feature_df['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(
            "customer {!r} not found in feature_df".format(customer_id)
        )
    customer_pos = int(matches[0])
    similarities = np.asarray(similarity_matrix)[customer_pos]

    # Sort descending with a stable sort so ties keep row order, and NaN
    # scores end up last instead of first.
    ranked = np.argsort(-similarities, kind='stable')
    # Drop the customer explicitly rather than assuming it sorts first:
    # another customer with an equal score could otherwise displace it.
    neighbours = ranked[ranked != customer_pos][:3]

    customer_ids = feature_df['CustomerID']
    return [
        {
            'customer_id': customer_ids.iloc[pos],
            # Plain float keeps str()/repr() output free of NumPy wrappers.
            'similarity_score': round(float(similarities[pos]), 3),
        }
        for pos in neighbours
    ]

# Customers without any transaction are dropped by the inner merge in
# create_customer_features and therefore have no row in feature_df; give
# them an empty lookalike list instead of failing the whole export.
known_ids = set(feature_df['CustomerID'])

# Lookalikes for the first 20 customers listed in customers_df.
recommendations = {
    customer_id: (
        get_top_3_similar(customer_id, feature_df, similarity_matrix)
        if customer_id in known_ids
        else []
    )
    for customer_id in customers_df['CustomerID'][:20]
}

# One output row per customer; the lookalikes are serialized as the string
# form of a list of (customer_id, score) tuples. Scores are cast to plain
# float so the text reads "0.987" rather than "np.float64(0.987)" under
# NumPy >= 2, where the repr of NumPy scalars includes the type name.
output_rows = [
    {
        'CustomerID': cust_id,
        'Lookalikes': str([
            (sc['customer_id'], float(sc['similarity_score']))
            for sc in similar_customers
        ]),
    }
    for cust_id, similar_customers in recommendations.items()
]

# Explicit columns keep the CSV header intact even when there are no rows.
output_df = pd.DataFrame(output_rows, columns=['CustomerID', 'Lookalikes'])
output_df.to_csv('Lookalike.csv', index=False)
