In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Lookalike model: build one numeric profile per customer from their
# transactions, standardize the profiles, and rank other customers by cosine
# similarity. The top matches for the first customers are written to a CSV.

TOP_N_LOOKALIKES = 3      # number of lookalikes reported per customer
N_TARGET_CUSTOMERS = 20   # number of leading customers to report on

# Load the Customers and Transactions data
customers_df = pd.read_csv('Customers.csv')
transactions_df = pd.read_csv('Transactions.csv')

# Inner join on CustomerID: only transactions whose CustomerID is present in
# Customers.csv are kept, and customers with no transactions do not appear.
merged_df = pd.merge(transactions_df, customers_df, on="CustomerID", how="inner")

# Numerical features used for the similarity calculation
numerical_features = ['TotalValue', 'Quantity', 'Price']

# Fill NaN values with 0 in the numerical features before aggregating/scaling
merged_df[numerical_features] = merged_df[numerical_features].fillna(0)

# One row per customer: each feature is summed over that customer's transactions.
# NOTE(review): summing 'Price' adds up per-transaction prices; confirm a sum
# (rather than e.g. a mean) is the intended profile feature.
customer_profiles = merged_df.groupby('CustomerID')[numerical_features].sum()

# Standardize the features so that no single column dominates the similarity
scaler = StandardScaler()
customer_profiles[numerical_features] = scaler.fit_transform(customer_profiles[numerical_features])

# Pairwise cosine similarity between all customer profiles
similarity_matrix = cosine_similarity(customer_profiles[numerical_features])

# Label rows and columns with CustomerID so lookups can be done by ID
similarity_df = pd.DataFrame(similarity_matrix, index=customer_profiles.index, columns=customer_profiles.index)

# Find the top lookalikes for each of the first N_TARGET_CUSTOMERS customers.
# NOTE(review): this takes the first entries of the groupby index (sorted by
# CustomerID); it matches C0001..C0020 only if all of those customers have
# transactions - verify against the data.
lookalike_results = {}

for customer in customer_profiles.index[:N_TARGET_CUSTOMERS]:
    # Remove the customer's own entry by label instead of assuming it sorts
    # first: a tie at ~1.0 with another customer (or a zero self-similarity for
    # an all-zero standardized profile) would otherwise let the customer appear
    # as their own lookalike. The stable sort keeps tied scores in index order.
    similar_customers = (
        similarity_df.loc[customer]
        .drop(labels=customer)
        .sort_values(ascending=False, kind="stable")
        .head(TOP_N_LOOKALIKES)
    )
    # Store plain Python floats so the string form written to the CSV is clean
    lookalike_results[customer] = [
        (similar_customer, float(round(score, 4)))
        for similar_customer, score in similar_customers.items()
    ]

# Convert the lookalike results to a DataFrame
lookalike_df = pd.DataFrame(list(lookalike_results.items()), columns=['CustomerID', 'Lookalikes'])

# Serialize each list of (CustomerID, score) tuples to a readable string
lookalike_df['Lookalikes'] = lookalike_df['Lookalikes'].apply(str)

# Save the results to a CSV file
lookalike_df.to_csv('Sujal_Chauhan_Lookalike.csv', index=False)

print("Lookalike model has been generated and saved to 'Sujal_Chauhan_Lookalike.csv'.")


Lookalike model has been generated and saved to 'Sujal_Chauhan_Lookalike.csv'.
