# Import Required Libraries
Import necessary libraries such as pandas, numpy, matplotlib, seaborn, and scikit-learn.

In [1]:
# Import Required Libraries

import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For data visualization
import seaborn as sns  # For statistical data visualization
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.cluster import KMeans  # For clustering
from sklearn.metrics import davies_bouldin_score  # For evaluating clustering quality
import warnings
warnings.filterwarnings('ignore')


# Load the Dataset
Load Customers.csv, Products.csv, and Transactions.csv into pandas DataFrames.

In [2]:
# Load the Dataset

# Read the three source tables; the file names are relative, so the CSVs must
# sit in the notebook's working directory.
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Preview the top rows of every table as a quick sanity check on the load
for loaded_frame in (customers_df, products_df, transactions_df):
    print(loaded_frame.head())

  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiving Biography        Books   44.12
3      P004            BookWorld Rug   Home Decor   95.69
4      P005          TechPro T-Shirt     Clothing  429.31
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067  2024-04-25 07:38:55         1   
3       

# Data Preprocessing
Handle missing values, convert data types, and remove duplicate records.

In [3]:
# Data Preprocessing

# Fill gaps with explicit placeholders: 'Unknown' for text fields, 0 for
# numeric ones. The frames are updated in place.
customer_placeholders = {'CustomerName': 'Unknown', 'Region': 'Unknown'}
product_placeholders = {'ProductName': 'Unknown', 'Category': 'Unknown', 'Price': 0}
transaction_placeholders = {'Quantity': 0, 'TotalValue': 0, 'Price': 0}
customers_df.fillna(customer_placeholders, inplace=True)
products_df.fillna(product_placeholders, inplace=True)
transactions_df.fillna(transaction_placeholders, inplace=True)

# Parse the date columns into real datetimes
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Force the numeric columns to their intended dtypes
products_df['Price'] = products_df['Price'].astype(float)
for numeric_column, target_dtype in (('Price', float), ('TotalValue', float), ('Quantity', int)):
    transactions_df[numeric_column] = transactions_df[numeric_column].astype(target_dtype)

# Show the resulting dtypes so the conversions can be checked by eye
for typed_frame in (customers_df, products_df, transactions_df):
    print(typed_frame.dtypes)

# Keep only the first row for every primary key
for keyed_frame, key_column in (
    (customers_df, 'CustomerID'),
    (products_df, 'ProductID'),
    (transactions_df, 'TransactionID'),
):
    keyed_frame.drop_duplicates(subset=key_column, keep='first', inplace=True)

# Preview each table after preprocessing
for cleaned_frame in (customers_df, products_df, transactions_df):
    print(cleaned_frame.head())

CustomerID              object
CustomerName            object
Region                  object
SignupDate      datetime64[ns]
dtype: object
ProductID       object
ProductName     object
Category        object
Price          float64
dtype: object
TransactionID              object
CustomerID                 object
ProductID                  object
TransactionDate    datetime64[ns]
Quantity                    int64
TotalValue                float64
Price                     float64
dtype: object
  CustomerID        CustomerName         Region SignupDate
0      C0001    Lawrence Carroll  South America 2022-07-10
1      C0002      Elizabeth Lutz           Asia 2022-02-13
2      C0003      Michael Rivera  South America 2024-03-07
3      C0004  Kathleen Rodriguez  South America 2022-10-09
4      C0005         Laura Weber           Asia 2022-08-15
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwat

# Lookalike Model
Build a Lookalike Model that recommends 3 similar customers based on their profile and transaction history.

In [4]:
# Lookalike Model

from sklearn.neighbors import NearestNeighbors

# Analyze customer transaction behavior: one row per customer holding the
# number of transactions, the total spend and the total quantity bought.
customer_transactions = transactions_df.groupby('CustomerID').agg({
    'TransactionID': 'count',
    'TotalValue': 'sum',
    'Quantity': 'sum'
}).reset_index()

# Merge with customer data. merge() defaults to an inner join, so customers
# that have no transactions do not appear in the result.
customer_analysis_df = customers_df.merge(customer_transactions, on='CustomerID')
# Average transaction value per customer ('TransactionID' holds the count here)
customer_analysis_df['AvgTransactionValue'] = customer_analysis_df['TotalValue'] / customer_analysis_df['TransactionID']

# Prepare the data for the lookalike model
customer_features = customer_analysis_df[['CustomerID', 'Region', 'TotalValue', 'Quantity', 'AvgTransactionValue']]
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Standardize the numeric columns before fitting. Euclidean distance is
# scale-sensitive: left in raw units, the spend columns would swamp both
# Quantity and the 0/1 region indicators, and neighbours would effectively be
# chosen on spend alone.
numeric_feature_columns = ['TotalValue', 'Quantity', 'AvgTransactionValue']
customer_features[numeric_feature_columns] = StandardScaler().fit_transform(
    customer_features[numeric_feature_columns]
)

# Fit the Nearest Neighbors model on everything except the identifier column
nn_model = NearestNeighbors(n_neighbors=4, metric='euclidean')
nn_model.fit(customer_features.drop(columns=['CustomerID']))

# Function to find lookalikes for a given customer
def find_lookalikes(customer_id):
    """Return the nearest-neighbour lookalikes of one customer.

    The customer is looked up in the notebook-level ``customer_features``
    frame, the fitted ``nn_model`` is queried with that customer's feature
    row, and every neighbour distance ``d`` is turned into a similarity
    ``1 / (1 + d)`` (1.0 for an identical profile, tending to 0 as the
    distance grows).

    Parameters
    ----------
    customer_id
        Value matched against ``customer_features['CustomerID']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``SimilarityScore``, nearest neighbour
        first. The queried customer is never included, and the frame has one
        row fewer than the number of neighbours ``nn_model`` returns.

    Raises
    ------
    IndexError
        If ``customer_id`` has no row in ``customer_features``.
    """
    feature_matrix = customer_features.drop(columns=['CustomerID'])

    # Resolve the customer's row *position*. Everything below is positional
    # (.iloc, neighbour indices), so an index label would only be correct
    # while the frame happens to carry a default 0..n-1 index.
    matches = np.flatnonzero((customer_features['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"customer_id {customer_id!r} has no row in customer_features")
    customer_pos = int(matches[0])

    # Query with a one-row DataFrame so the columns reach the model in the
    # same layout as the feature matrix itself.
    distances, indices = nn_model.kneighbors(feature_matrix.iloc[[customer_pos]])
    distances, indices = distances[0], indices[0]

    # Exclude the customer explicitly rather than assuming it is neighbour 0:
    # with identical profiles the distance-0 tie can be returned in any order.
    # Capping at len - 1 keeps the result size the same in every case.
    not_self = indices != customer_pos
    keep = len(indices) - 1
    neighbour_positions = indices[not_self][:keep]
    neighbour_distances = distances[not_self][:keep]

    # Build an independent frame instead of writing into a slice of
    # customer_features (which pandas reports as SettingWithCopyWarning).
    lookalikes = customer_features.iloc[neighbour_positions][['CustomerID']].copy()
    lookalikes['SimilarityScore'] = 1 / (1 + neighbour_distances)
    return lookalikes

# For each of the first 20 customers in customers_df, store the lookalikes as
# a list of [CustomerID, SimilarityScore] pairs keyed by the customer's id.
lookalike_results = {}
for customer_id in customers_df['CustomerID'].head(20):
    matched_customers = find_lookalikes(customer_id)
    lookalike_results[customer_id] = matched_customers.to_numpy().tolist()

# One CSV row per customer: the id first, then one cell per lookalike pair.
# No header row is written.
lookalike_df = pd.DataFrame.from_dict(data=lookalike_results, orient='index')
lookalike_df.to_csv(path_or_buf='FirstName_LastName_Lookalike.csv', header=False)

# Feature Engineering
Create recency, frequency, and monetary (RFM) features from customer and transaction information for the Lookalike Model.

In [5]:
# Feature Engineering

# Build recency / frequency / monetary features per customer from the
# transaction history and attach them to the customer table.

transactions_by_customer = transactions_df.groupby('CustomerID')
newest_transaction = transactions_df['TransactionDate'].max()

# Recency: whole days between a customer's last transaction and the newest
# transaction anywhere in the data
latest_transaction_date = (
    transactions_by_customer['TransactionDate']
    .max()
    .rename('LastTransactionDate')
    .reset_index()
)
latest_transaction_date['Recency'] = (
    newest_transaction - latest_transaction_date['LastTransactionDate']
).dt.days

# Frequency: number of transactions per customer
transaction_frequency = (
    transactions_by_customer['TransactionID'].count().rename('Frequency').reset_index()
)

# Monetary: total transaction value per customer
transaction_monetary = (
    transactions_by_customer['TotalValue'].sum().rename('Monetary').reset_index()
)

# Join the three features onto the customer table, one at a time
customer_features = customers_df
for rfm_table, rfm_column in (
    (latest_transaction_date, 'Recency'),
    (transaction_frequency, 'Frequency'),
    (transaction_monetary, 'Monetary'),
):
    customer_features = customer_features.merge(
        rfm_table[['CustomerID', rfm_column]], on='CustomerID'
    )

# Expand 'Region' into one indicator column per region
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Standardize the three numeric features
rfm_columns = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
customer_features[rfm_columns] = scaler.fit_transform(customer_features[rfm_columns])

# Preview the engineered features
print(customer_features.head())

  CustomerID        CustomerName SignupDate   Recency  Frequency  Monetary  \
0      C0001    Lawrence Carroll 2022-07-10 -0.266933  -0.011458 -0.061701   
1      C0002      Elizabeth Lutz 2022-02-13 -0.690872  -0.467494 -0.877744   
2      C0003      Michael Rivera 2024-03-07  0.722260  -0.467494 -0.405857   
3      C0004  Kathleen Rodriguez 2022-10-09 -0.987630   1.356650  1.032547   
4      C0005         Laura Weber 2022-08-15 -0.281064  -0.923530 -0.783929   

   Region_Asia  Region_Europe  Region_North America  Region_South America  
0        False          False                 False                  True  
1         True          False                 False                 False  
2        False          False                 False                  True  
3        False          False                 False                  True  
4         True          False                 False                 False  


# Model Development
Develop the Lookalike Model using an appropriate algorithm and calculate similarity scores.

In [6]:
# Model Development

from sklearn.neighbors import NearestNeighbors

# Keep the identifier, the three RFM columns and every region indicator
region_columns = [name for name in customer_features.columns if 'Region_' in name]
model_columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary', *region_columns]
customer_features = customer_features[model_columns]

# Fit a 4-neighbour Euclidean model on everything except the identifier
nn_model = NearestNeighbors(n_neighbors=4, metric='euclidean')
feature_matrix = customer_features.drop(columns=['CustomerID'])
nn_model.fit(feature_matrix)

# Function to find lookalikes for a given customer
def find_lookalikes(customer_id):
    """Return the nearest-neighbour lookalikes of one customer.

    The customer is looked up in the notebook-level ``customer_features``
    frame, the fitted ``nn_model`` is queried with that customer's feature
    row, and every neighbour distance ``d`` is turned into a similarity
    ``1 / (1 + d)`` (1.0 for an identical profile, tending to 0 as the
    distance grows).

    Parameters
    ----------
    customer_id
        Value matched against ``customer_features['CustomerID']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``SimilarityScore``, nearest neighbour
        first. The queried customer is never included, and the frame has one
        row fewer than the number of neighbours ``nn_model`` returns.

    Raises
    ------
    IndexError
        If ``customer_id`` has no row in ``customer_features``.
    """
    feature_matrix = customer_features.drop(columns=['CustomerID'])

    # Resolve the customer's row *position*. Everything below is positional
    # (.iloc, neighbour indices), so an index label would only be correct
    # while the frame happens to carry a default 0..n-1 index.
    matches = np.flatnonzero((customer_features['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"customer_id {customer_id!r} has no row in customer_features")
    customer_pos = int(matches[0])

    # Query with a one-row DataFrame so the columns reach the model in the
    # same layout as the feature matrix itself.
    distances, indices = nn_model.kneighbors(feature_matrix.iloc[[customer_pos]])
    distances, indices = distances[0], indices[0]

    # Exclude the customer explicitly rather than assuming it is neighbour 0:
    # with identical profiles the distance-0 tie can be returned in any order.
    # Capping at len - 1 keeps the result size the same in every case.
    not_self = indices != customer_pos
    keep = len(indices) - 1
    neighbour_positions = indices[not_self][:keep]
    neighbour_distances = distances[not_self][:keep]

    # Build an independent frame instead of writing into a slice of
    # customer_features (which pandas reports as SettingWithCopyWarning).
    lookalikes = customer_features.iloc[neighbour_positions][['CustomerID']].copy()
    lookalikes['SimilarityScore'] = 1 / (1 + neighbour_distances)
    return lookalikes

# For each of the first 20 customers in customers_df, store the lookalikes as
# a list of [CustomerID, SimilarityScore] pairs keyed by the customer's id.
lookalike_results = {}
for customer_id in customers_df['CustomerID'].head(20):
    matched_customers = find_lookalikes(customer_id)
    lookalike_results[customer_id] = matched_customers.to_numpy().tolist()

# One CSV row per customer: the id first, then one cell per lookalike pair.
# No header row is written.
lookalike_df = pd.DataFrame.from_dict(data=lookalike_results, orient='index')
lookalike_df.to_csv(path_or_buf='FirstName_LastName_Lookalike.csv', header=False)

# Top 3 Lookalikes for First 20 Customers
Generate the top 3 lookalikes with their similarity scores for the first 20 customers and save them to FirstName_LastName_Lookalike.csv.

In [7]:
# Feature Engineering

# Build recency / frequency / monetary features per customer from the
# transaction history and attach them to the customer table.

transactions_by_customer = transactions_df.groupby('CustomerID')
newest_transaction = transactions_df['TransactionDate'].max()

# Recency: whole days between a customer's last transaction and the newest
# transaction anywhere in the data
latest_transaction_date = (
    transactions_by_customer['TransactionDate']
    .max()
    .rename('LastTransactionDate')
    .reset_index()
)
latest_transaction_date['Recency'] = (
    newest_transaction - latest_transaction_date['LastTransactionDate']
).dt.days

# Frequency: number of transactions per customer
transaction_frequency = (
    transactions_by_customer['TransactionID'].count().rename('Frequency').reset_index()
)

# Monetary: total transaction value per customer
transaction_monetary = (
    transactions_by_customer['TotalValue'].sum().rename('Monetary').reset_index()
)

# Join the three features onto the customer table, one at a time
customer_features = customers_df
for rfm_table, rfm_column in (
    (latest_transaction_date, 'Recency'),
    (transaction_frequency, 'Frequency'),
    (transaction_monetary, 'Monetary'),
):
    customer_features = customer_features.merge(
        rfm_table[['CustomerID', rfm_column]], on='CustomerID'
    )

# Expand 'Region' into one indicator column per region
customer_features = pd.get_dummies(customer_features, columns=['Region'])

# Standardize the three numeric features
rfm_columns = ['Recency', 'Frequency', 'Monetary']
scaler = StandardScaler()
customer_features[rfm_columns] = scaler.fit_transform(customer_features[rfm_columns])

# Preview the engineered features
print(customer_features.head())

# Model Development

from sklearn.neighbors import NearestNeighbors

# Keep the identifier, the three RFM columns and every region indicator
region_columns = [name for name in customer_features.columns if 'Region_' in name]
model_columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary', *region_columns]
customer_features = customer_features[model_columns]

# Fit a 4-neighbour Euclidean model on everything except the identifier
nn_model = NearestNeighbors(n_neighbors=4, metric='euclidean')
feature_matrix = customer_features.drop(columns=['CustomerID'])
nn_model.fit(feature_matrix)

# Function to find lookalikes for a given customer
def find_lookalikes(customer_id):
    """Return the nearest-neighbour lookalikes of one customer.

    The customer is looked up in the notebook-level ``customer_features``
    frame, the fitted ``nn_model`` is queried with that customer's feature
    row, and every neighbour distance ``d`` is turned into a similarity
    ``1 / (1 + d)`` (1.0 for an identical profile, tending to 0 as the
    distance grows).

    Parameters
    ----------
    customer_id
        Value matched against ``customer_features['CustomerID']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CustomerID`` and ``SimilarityScore``, nearest neighbour
        first. The queried customer is never included, and the frame has one
        row fewer than the number of neighbours ``nn_model`` returns.

    Raises
    ------
    IndexError
        If ``customer_id`` has no row in ``customer_features``.
    """
    feature_matrix = customer_features.drop(columns=['CustomerID'])

    # Resolve the customer's row *position*. Everything below is positional
    # (.iloc, neighbour indices), so an index label would only be correct
    # while the frame happens to carry a default 0..n-1 index.
    matches = np.flatnonzero((customer_features['CustomerID'] == customer_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"customer_id {customer_id!r} has no row in customer_features")
    customer_pos = int(matches[0])

    # Query with a one-row DataFrame so the columns reach the model in the
    # same layout as the feature matrix itself.
    distances, indices = nn_model.kneighbors(feature_matrix.iloc[[customer_pos]])
    distances, indices = distances[0], indices[0]

    # Exclude the customer explicitly rather than assuming it is neighbour 0:
    # with identical profiles the distance-0 tie can be returned in any order.
    # Capping at len - 1 keeps the result size the same in every case.
    not_self = indices != customer_pos
    keep = len(indices) - 1
    neighbour_positions = indices[not_self][:keep]
    neighbour_distances = distances[not_self][:keep]

    # Build an independent frame instead of writing into a slice of
    # customer_features (which pandas reports as SettingWithCopyWarning).
    lookalikes = customer_features.iloc[neighbour_positions][['CustomerID']].copy()
    lookalikes['SimilarityScore'] = 1 / (1 + neighbour_distances)
    return lookalikes

# For each of the first 20 customers in customers_df, store the lookalikes as
# a list of [CustomerID, SimilarityScore] pairs keyed by the customer's id.
lookalike_results = {}
for customer_id in customers_df['CustomerID'].head(20):
    matched_customers = find_lookalikes(customer_id)
    lookalike_results[customer_id] = matched_customers.to_numpy().tolist()

# Show the table, then write it out. One CSV row per customer: the id first,
# then one cell per lookalike pair. No header row is written.
lookalike_df = pd.DataFrame.from_dict(data=lookalike_results, orient='index')
print(lookalike_df)
lookalike_df.to_csv(path_or_buf='FirstName_LastName_Lookalike.csv', header=False)

  CustomerID        CustomerName SignupDate   Recency  Frequency  Monetary  \
0      C0001    Lawrence Carroll 2022-07-10 -0.266933  -0.011458 -0.061701   
1      C0002      Elizabeth Lutz 2022-02-13 -0.690872  -0.467494 -0.877744   
2      C0003      Michael Rivera 2024-03-07  0.722260  -0.467494 -0.405857   
3      C0004  Kathleen Rodriguez 2022-10-09 -0.987630   1.356650  1.032547   
4      C0005         Laura Weber 2022-08-15 -0.281064  -0.923530 -0.783929   

   Region_Asia  Region_Europe  Region_North America  Region_South America  
0        False          False                 False                  True  
1         True          False                 False                 False  
2        False          False                 False                  True  
3        False          False                 False                  True  
4         True          False                 False                 False  
                                  0                             1  \
C0001 