In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')


In [3]:

# Merge datasets for feature engineering
merged = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')


In [5]:
# Verify column names
print(merged.columns)

# Adjusting feature engineering if 'Price' column is missing
customer_features = merged.groupby('CustomerID').agg({
    'TotalValue': 'sum',  # Total revenue from transactions
    'Quantity': 'sum',    # Total quantity purchased
    'ProductID': 'count',  # Number of distinct products purchased
    'Category': lambda x: x.mode()[0],  # Most purchased product category
    'Region': 'first',     # Customer's region
    'SignupDate': 'first'  # Customer's signup date
}).reset_index()


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [6]:

# Encode categorical variables
customer_features = pd.get_dummies(customer_features, columns=['Region', 'Category'], drop_first=True)


In [8]:
# Check columns in customer_features
print("Columns in customer_features:", customer_features.columns)

# Adjust numeric features based on available columns
numeric_features = [col for col in ['TotalValue', 'Quantity', 'Price'] if col in customer_features.columns]

# Apply MinMaxScaler
scaler = MinMaxScaler()
customer_features[numeric_features] = scaler.fit_transform(customer_features[numeric_features])

print("Feature scaling completed successfully.")


Columns in customer_features: Index(['CustomerID', 'TotalValue', 'Quantity', 'ProductID', 'SignupDate',
       'Region_Europe', 'Region_North America', 'Region_South America',
       'Category_Clothing', 'Category_Electronics', 'Category_Home Decor'],
      dtype='object')
Feature scaling completed successfully.


In [10]:
# Check and select numeric columns
print("Customer Features Types:\n", customer_features.dtypes)
features_for_similarity = customer_features.select_dtypes(include=['float64', 'int64'])

# If needed, encode categorical columns
features_for_similarity = pd.get_dummies(customer_features.drop(columns=['SignupDate']), drop_first=True)

# Compute similarity matrix
similarity_matrix = cosine_similarity(features_for_similarity)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features['CustomerID'], columns=customer_features['CustomerID'])

print("Cosine similarity computed successfully.")


Customer Features Types:
 CustomerID               object
TotalValue              float64
Quantity                float64
ProductID                 int64
SignupDate               object
Region_Europe             uint8
Region_North America      uint8
Region_South America      uint8
Category_Clothing         uint8
Category_Electronics      uint8
Category_Home Decor       uint8
dtype: object
Cosine similarity computed successfully.


In [11]:
# Generate Lookalike Recommendations
lookalike_data = {}
for customer_id in customer_features['CustomerID'][:20]:  # First 20 customers
    similar_customers = similarity_df[customer_id].sort_values(ascending=False).iloc[1:4]
    lookalike_data[customer_id] = list(similar_customers.index), list(similar_customers.values)

In [12]:
# Save to Lookalike.csv
lookalike_results = []
for cust_id, (lookalikes, scores) in lookalike_data.items():
    for lookalike, score in zip(lookalikes, scores):
        lookalike_results.append({'cust_id': cust_id, 'lookalike': lookalike, 'score': score})

lookalike_df = pd.DataFrame(lookalike_results)
lookalike_df.to_csv('Lookalike.csv', index=False)

In [13]:
print("Lookalike.csv generated successfully.")

Lookalike.csv generated successfully.
