In [1]:
# Import necessary libraries for data manipulation, scaling, and similarity measurement
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load datasets: Customers, Products, and Transactions
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

In [3]:
merged_data = pd.merge(transactions, customers, on="CustomerID", how="left")
merged_data = pd.merge(merged_data, products, on="ProductID", how="left")

In [4]:
# Create customer profiles by aggregating transaction data for each customer
customer_profile = merged_data.groupby('CustomerID').agg(
    total_spending=('TotalValue', 'sum'),
    transaction_count=('TransactionID', 'count'),
    unique_products=('ProductID', 'nunique')
).reset_index()

In [5]:
scaler = StandardScaler()
customer_profile[['total_spending', 'transaction_count', 'unique_products']] = scaler.fit_transform(
    customer_profile[['total_spending', 'transaction_count', 'unique_products']]
)

In [6]:
customer_profile = pd.merge(customers, customer_profile, on="CustomerID", how="left", suffixes=('', '_profile'))


In [7]:
features = customer_profile[['total_spending', 'transaction_count', 'unique_products']]


In [8]:
# Check for null values in the customer profile to ensure data completeness
null_values = customer_profile.isnull().sum()
print("Null values in each column:\n", null_values)


Null values in each column:
 CustomerID           0
CustomerName         0
Region               0
SignupDate           0
total_spending       1
transaction_count    1
unique_products      1
dtype: int64


In [9]:
# Handle missing values by filling them with mean (for numerical data) or mode
customer_profile['total_spending'].fillna(customer_profile['total_spending'].mean(), inplace=True)
customer_profile['transaction_count'].fillna(customer_profile['transaction_count'].mode()[0], inplace=True)
customer_profile['unique_products'].fillna(customer_profile['unique_products'].mode()[0], inplace=True)

# Verify that all null values have been handled successfully
null_values_after = customer_profile.isnull().sum()
print("Null values after handling:\n", null_values_after)

Null values after handling:
 CustomerID           0
CustomerName         0
Region               0
SignupDate           0
total_spending       0
transaction_count    0
unique_products      0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_profile['total_spending'].fillna(customer_profile['total_spending'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  customer_profile['transaction_count'].fillna(customer_profile['transaction_count'].mode()[0], inplace=True)
The behavior will change 

In [10]:
customer_profile

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,total_spending,transaction_count,unique_products
0,C0001,Lawrence Carroll,South America,2022-07-10,-0.061701,-0.011458,0.050047
1,C0002,Elizabeth Lutz,Asia,2022-02-13,-0.877744,-0.467494,-0.424204
2,C0003,Michael Rivera,South America,2024-03-07,-0.405857,-0.467494,-0.424204
3,C0004,Kathleen Rodriguez,South America,2022-10-09,1.032547,1.356650,1.472798
4,C0005,Laura Weber,Asia,2022-08-15,-0.783929,-0.923530,-0.898455
...,...,...,...,...,...,...,...
195,C0196,Laura Watts,Europe,2022-06-07,0.829053,-0.467494,-0.898455
196,C0197,Christina Harvey,Europe,2023-03-21,-0.841689,-0.923530,-0.898455
197,C0198,Rebecca Ray,Europe,2022-02-27,-1.386975,-1.379566,-1.372705
198,C0199,Andrea Jenkins,Europe,2022-12-03,-0.813993,-0.467494,-0.424204


In [11]:
# Function to get top 3 lookalike customers based on cosine similarity
def get_top_lookalikes(customer_id, num_lookalikes=3):
    customer_index = customer_profile[customer_profile['CustomerID'] == customer_id].index[0]
    similarity_scores = cosine_sim[customer_index]
    similar_indices = similarity_scores.argsort()[-(num_lookalikes + 1):-1]  # Exclude self similarity
    similar_customers = customer_profile.iloc[similar_indices]
    similar_customers['score'] = similarity_scores[similar_indices]
    return similar_customers[['CustomerID', 'score']]


In [19]:
# Fill NaN values in the features DataFrame
features.fillna(features.mean(), inplace=True)

# Compute cosine similarity
cosine_sim = cosine_similarity(features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  features.fillna(features.mean(), inplace=True)


In [17]:
# Create a list of top 3 lookalikes for each of the first 20 customers
lookalike_data = []
for customer_id in customer_profile['CustomerID'][:20]:
    top_lookalikes = get_top_lookalikes(customer_id)
    lookalike_data.append({
        'cust_id': customer_id,
        'lookalikes': top_lookalikes.to_dict(orient='records')
    })

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['score'] = similarity_scores[similar_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['score'] = similarity_scores[similar_indices]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_customers['score'] = similarity_scores[similar_indices]
A value is tr

In [18]:
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the result
print(lookalike_df.head())

  cust_id                                         lookalikes
0   C0001  [{'CustomerID': 'C0056', 'score': 0.9304265739...
1   C0002  [{'CustomerID': 'C0010', 'score': 0.9991818003...
2   C0003  [{'CustomerID': 'C0131', 'score': 0.9995698787...
3   C0004  [{'CustomerID': 'C0108', 'score': 0.9993154296...
4   C0005  [{'CustomerID': 'C0095', 'score': 0.9999466322...
