# In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the three input tables from CSV files in the working directory.
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Parse the signup timestamp and derive the signup year from it.
# Keyword order matters: SignupYear reads the already-parsed SignupDate.
customers = customers.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate']),
    SignupYear=lambda frame: frame['SignupDate'].dt.year,
)

# Collapse the transaction log to one row per customer: total spend,
# total units bought, and a comma-separated string of purchased product IDs.
customer_transactions = (
    transactions
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),
        Quantity=('Quantity', 'sum'),
        ProductID=('ProductID', lambda product_ids: ','.join(product_ids)),
    )
    .reset_index()
)

# Attach the purchase summary to every customer. The left join keeps
# customers that have no transactions; their missing ProductID becomes an
# empty string so it can be concatenated into the profile text later.
customer_profiles = (
    customers
    .merge(customer_transactions, how='left', on='CustomerID')
    .assign(ProductID=lambda profiles: profiles['ProductID'].fillna(''))
)

# Build one text "document" per customer: region, purchased product IDs and
# signup year, separated by single spaces.
customer_profiles['ProfileData'] = (
    customer_profiles['Region']
    .add(' ')
    .add(customer_profiles['ProductID'])
    .add(' ')
    .add(customer_profiles['SignupYear'].astype(str))
)

# Turn the profile documents into TF-IDF vectors (one row per customer).
vectorizer = TfidfVectorizer()
profile_matrix = vectorizer.fit_transform(customer_profiles['ProfileData'])

# Square matrix of cosine similarities: entry [i, j] compares the profiles
# in rows i and j of customer_profiles.
similarity_matrix = cosine_similarity(profile_matrix)

# Generate recommendations for the first 20 customers
NUM_TARGET_CUSTOMERS = 20  # how many leading customers get recommendations
TOP_N_LOOKALIKES = 3       # how many lookalikes to keep per customer

# Row i of similarity_matrix corresponds to all_customer_ids[i]; a plain list
# avoids a DataFrame row lookup for every match.
all_customer_ids = customer_profiles['CustomerID'].tolist()

# Maps each target CustomerID to a list of (lookalike CustomerID, score)
# tuples, ordered from most to least similar.
lookalike_results = {}

for idx, customer_id in enumerate(all_customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Drop the customer's own row by index rather than by sorted position.
    # Another customer with an identical profile ties with the self-match,
    # and because the sort is stable an earlier-indexed tie would sort ahead
    # of it -- skipping "the first element" would then keep the customer as
    # their own lookalike and discard a genuine match.
    similarity_scores = [
        (other_idx, score)
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # Highest similarity first; ties keep their original row order.
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    similar_customers = [
        (all_customer_ids[other_idx], score)
        for other_idx, score in similarity_scores[:TOP_N_LOOKALIKES]
    ]
    # Store results in the lookalike map
    lookalike_results[customer_id] = similar_customers

# Save results to Lookalike.csv
# One row per target customer; the Lookalikes column holds the string form of
# a list of {'CustomerID', 'Score'} dicts.
lookalike_df = pd.DataFrame(
    [
        {
            'CustomerID': cust_id,
            'Lookalikes': str([
                # Cast to a built-in float: under NumPy >= 2 the repr of a
                # NumPy scalar is "np.float64(...)", which would otherwise be
                # written verbatim into the CSV.
                {'CustomerID': match_id, 'Score': float(score)}
                for match_id, score in matches
            ]),
        }
        for cust_id, matches in lookalike_results.items()
    ],
    # Explicit columns so the CSV still gets a header when there are no rows.
    columns=['CustomerID', 'Lookalikes'],
)

lookalike_df.to_csv('Vineet_Pundpal_Lookalike.csv', index=False)

# Display the lookalike results for validation
print(lookalike_df)


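# Notebook cell output (apparently from an earlier run: the filename below
# differs from the one the code above writes):
# PermissionError: [Errno 13] Permission denied: 'FirstName_LastName_Lookalike.csv'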