# In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Loading the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')  # loaded but not used in the code below
transactions = pd.read_csv('Transactions.csv')

# Feature engineering: total spend and total quantity per customer
customer_transactions = (
    transactions.groupby('CustomerID')
    .agg({'TotalValue': 'sum', 'Quantity': 'sum'})
    .reset_index()
)
# Left merge keeps one row per row of `customers`, in the same order, so row
# positions in `customer_data` line up with row positions in `customers`.
customer_data = pd.merge(customers, customer_transactions, on='CustomerID', how='left')

# Preprocessing pipeline for numerical and categorical features
numerical_features = ['TotalValue', 'Quantity']
categorical_features = ['Region']

# Customers without any transaction get NaN aggregates from the left merge;
# treat those as zero spend / zero quantity. Fill ONLY the numeric aggregates:
# a frame-wide fillna(0) would also write the integer 0 into missing values of
# string columns such as Region, handing the encoder a mixed str/int column.
customer_data[numerical_features] = customer_data[numerical_features].fillna(0)

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

# Applying the preprocessing pipeline to the data
X = customer_data[numerical_features + categorical_features]
X_preprocessed = preprocessor.fit_transform(X)

# Calculating similarity: entry [i, j] is the cosine similarity between the
# preprocessed feature vectors of rows i and j of `customer_data`.
similarity_matrix = cosine_similarity(X_preprocessed)

# Finding lookalikes
def find_lookalikes(customer_id, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Similarity scores are read from the module-level ``similarity_matrix``,
    whose rows are addressed by the row position of the customer in the
    module-level ``customers`` frame.

    Args:
        customer_id: Value to look up in ``customers['CustomerID']``. If it
            occurs more than once, the first occurrence is used.
        n: Maximum number of lookalikes to return. Values below 1 yield an
            empty list.

    Returns:
        A list of ``(CustomerID, score)`` tuples ordered by descending score.
        Ties keep the row order of ``customers``. The queried customer itself
        is never part of the result.

    Raises:
        IndexError: If ``customer_id`` does not occur in ``customers``.
    """
    id_column = customers['CustomerID']

    # Work with row positions rather than index labels: the similarity matrix
    # is positional, so this stays correct even without a default RangeIndex.
    positions = (id_column == customer_id).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"CustomerID {customer_id!r} not found in customers")
    idx = int(positions[0])

    # Drop the query row explicitly. Slicing off the first sorted entry is not
    # safe: another customer with an identical feature vector (or a score that
    # rounds marginally higher) can sort ahead of the query customer.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[idx])
        if pos != idx
    ]
    # sorted() is stable, also with reverse=True, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [
        (id_column.iloc[pos], score)
        for pos, score in candidates[:max(n, 0)]
    ]

# Generating Lookalike.csv: look up the lookalikes of the first 20 customers
lookalike_map = {
    customer_id: find_lookalikes(customer_id)
    for customer_id in customers['CustomerID'][:20]
}

# One flat row per customer: the customer's ID followed by each
# (lookalike ID, score) pair unpacked in order.
lookalike_list = [
    [customer_id, *(value for match in matches for value in match)]
    for customer_id, matches in lookalike_map.items()
]

# Column layout of the output file; matches three lookalikes per customer,
# which is what find_lookalikes returns with its default n.
output_columns = [
    'CustomerID',
    'Lookalike1', 'Score1',
    'Lookalike2', 'Score2',
    'Lookalike3', 'Score3',
]
lookalike_df = pd.DataFrame(lookalike_list, columns=output_columns)
lookalike_df.to_csv('Lookalike.csv', index=False)

print("Lookalike.csv generated successfully!")


# Output: Lookalike.csv generated successfully!
