# Assignment 2 Lookalike Model

## Load the datasets and explore them

In [24]:
import pandas as pd

# Read the three source tables (customers, products, transactions) from the
# CSV files in the current working directory
customers, products, transactions = (
    pd.read_csv(csv_name)
    for csv_name in ("Customers.csv", "Products.csv", "Transactions.csv")
)

# Peek at the first few transaction rows
transactions.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [26]:
# Build one wide table: attach product details to every transaction via
# ProductID, then attach customer details via CustomerID
data = pd.merge(
    pd.merge(transactions, products, on='ProductID'),
    customers,
    on='CustomerID',
)
data.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00761,C0199,P022,2024-10-01 05:57:09,4,550.16,137.54,HomeSense Wall Art,Home Decor,137.54,Andrea Jenkins,Europe,2022-12-03
2,T00626,C0199,P079,2024-08-17 12:06:08,2,834.74,417.37,ActiveWear Rug,Home Decor,417.37,Andrea Jenkins,Europe,2022-12-03
3,T00963,C0199,P008,2024-10-26 00:01:58,2,293.7,146.85,BookWorld Bluetooth Speaker,Electronics,146.85,Andrea Jenkins,Europe,2022-12-03
4,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04


In [27]:
# Total amount each customer has spent across all of their transactions
total_spending = (
    data.groupby('CustomerID')['TotalValue']
    .sum()
    .rename('TotalSpending')
    .reset_index()
)

# Purchase frequency: number of distinct transactions per customer
transaction_count = (
    data.groupby('CustomerID')['TransactionID']
    .nunique()
    .rename('TransactionCount')
    .reset_index()
)

# Spending by category: one column per product category, 0 where the customer
# bought nothing in that category
category_spending = data.pivot_table(
    index='CustomerID',
    columns='Category',
    values='TotalValue',
    aggfunc='sum',
    fill_value=0
).reset_index()

# Combine all calculated features. Left joins starting from `customers` keep
# customers that have no rows in `data`.
customer_features = (
    customers
    .merge(total_spending, on='CustomerID', how='left')
    .merge(transaction_count, on='CustomerID', how='left')
    .merge(category_spending, on='CustomerID', how='left')
)

# The left joins leave NaN in the engineered columns for customers without
# transactions. Zero-fill only those columns: a blanket fillna(0) would also
# write a numeric 0 into any missing CustomerName / Region / SignupDate value.
engineered_columns = [
    column for column in customer_features.columns
    if column not in customers.columns
]
customer_features = customer_features.fillna(
    {column: 0 for column in engineered_columns}
)

customer_features.head()


Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TotalSpending,TransactionCount,Books,Clothing,Electronics,Home Decor
0,C0001,Lawrence Carroll,South America,2022-07-10,3354.52,5.0,114.6,0.0,2827.3,412.62
1,C0002,Elizabeth Lutz,Asia,2022-02-13,1862.74,4.0,0.0,1025.46,0.0,837.28
2,C0003,Michael Rivera,South America,2024-03-07,2725.38,4.0,0.0,122.36,1385.2,1217.82
3,C0004,Kathleen Rodriguez,South America,2022-10-09,5354.88,8.0,1888.48,0.0,1355.74,2110.66
4,C0005,Laura Weber,Asia,2022-08-15,2034.24,3.0,0.0,0.0,1180.38,853.86


In [28]:
from sklearn.preprocessing import StandardScaler

# Standardise the engineered features with StandardScaler. The identifier and
# profile columns are removed first so that only feature columns are scaled.
numeric_columns = customer_features.drop(
    ['CustomerID', 'CustomerName', 'Region', 'SignupDate'], axis='columns'
)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(numeric_columns)

# Rebuild a labelled DataFrame from the scaled array and put CustomerID back
# as the leading column
scaled_customer_features = pd.DataFrame(data=scaled_data, columns=numeric_columns.columns)
scaled_customer_features.insert(loc=0, column='CustomerID', value=customer_features['CustomerID'])


In [29]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Requires `customer_features` to have been built by an earlier cell

# Keep only the feature columns; identifier and profile columns are not scaled
numerical_features = customer_features.drop(
    ['CustomerID', 'CustomerName', 'Region', 'SignupDate'], axis='columns'
)
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numerical_features)

# Pairwise cosine similarity between the scaled feature rows
cosine_sim_matrix = cosine_similarity(scaled_features)

# Label both axes with CustomerID so scores can be looked up by customer
customer_ids = customer_features['CustomerID']
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=customer_ids, columns=customer_ids)

# Preview the first few rows of the similarity table
cosine_sim_df.head()


CustomerID,C0001,C0002,C0003,C0004,C0005,C0006,C0007,C0008,C0009,C0010,...,C0191,C0192,C0193,C0194,C0195,C0196,C0197,C0198,C0199,C0200
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,1.0,-0.319065,0.604081,0.027565,0.505113,-0.886533,0.534018,-0.133192,0.125816,-0.313221,...,-0.056504,0.736715,-0.373994,0.419032,-0.031489,-0.662172,0.350923,-0.102808,0.151223,-0.709531
C0002,-0.319065,1.0,0.336621,-0.610442,0.511188,0.113787,0.3772,-0.133261,0.748672,0.794772,...,-0.363052,0.239842,-0.209146,-0.24395,0.1921,0.099372,0.611969,0.824284,0.659068,-0.101643
C0003,0.604081,0.336621,1.0,0.005894,0.899089,-0.607972,0.971694,-0.198991,0.393511,-0.103627,...,-0.380259,0.54833,-0.478794,-0.01826,0.439101,-0.155883,0.886128,0.249401,0.851629,-0.836383
C0004,0.027565,-0.610442,0.005894,1.0,-0.350137,-0.040762,-0.110354,0.526417,-0.814908,-0.902839,...,0.005825,-0.587686,-0.035718,0.167316,0.489748,0.35282,-0.314026,-0.893832,-0.060197,-0.105431
C0005,0.505113,0.511188,0.899089,-0.350137,1.0,-0.53402,0.948695,-0.528756,0.724524,0.230918,...,-0.126422,0.73337,-0.195534,-0.283222,0.072434,-0.325046,0.981701,0.603625,0.842415,-0.799318


In [30]:
# Function to get the top N similar customers for each customer
def get_top_similar_customers(similarity_df, top_n=3):
    """Return the ``top_n`` most similar customers for every row of a similarity table.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        One row per customer to produce recommendations for. Columns are the
        candidate customer IDs and values are similarity scores, where a
        higher score means more similar. May be a row subset of a square
        similarity matrix.
    top_n : int, default 3
        Maximum number of similar customers to return per row.

    Returns
    -------
    dict
        Maps each row label to a list of ``[other_customer_id, score]`` pairs,
        ordered from most to least similar, with at most ``top_n`` entries.
    """
    similar_customers = {}
    for customer in similarity_df.index:
        row = similarity_df.loc[customer]
        # Remove the customer's own entry by label rather than by position.
        # Skipping "the first row after sorting" is wrong whenever another
        # customer's score ties or exceeds the self-score (duplicate feature
        # vectors, floating-point rounding): the customer would then be
        # reported as their own lookalike. errors='ignore' covers tables whose
        # columns do not contain the row's own ID.
        candidates = row.drop(labels=customer, errors='ignore')
        # A stable sort keeps tied scores in column order, so results are
        # reproducible between runs.
        ranked = candidates.sort_values(ascending=False, kind='stable').head(top_n)
        similar_customers[customer] = [[other, score] for other, score in ranked.items()]
    return similar_customers

# Restrict the similarity table to the rows labelled C0001 through C0020
# (a label slice, so both end labels are included) and collect the three
# closest matches for each of those customers
first_twenty_rows = cosine_sim_df.loc['C0001':'C0020']
recommendations = get_top_similar_customers(first_twenty_rows, top_n=3)
recommendations


{'C0001': [['C0069', 0.979454959557427],
  ['C0091', 0.9642200859036124],
  ['C0181', 0.8992721755431989]],
 'C0002': [['C0159', 0.956175625695053],
  ['C0036', 0.9306931974215171],
  ['C0129', 0.8850716127598941]],
 'C0003': [['C0007', 0.9716942768861005],
  ['C0166', 0.9532145909456898],
  ['C0085', 0.9249826779904693]],
 'C0004': [['C0075', 0.9809085265987747],
  ['C0090', 0.9387392358590677],
  ['C0065', 0.9262010838352783]],
 'C0005': [['C0197', 0.9817011715903354],
  ['C0085', 0.966843790367],
  ['C0007', 0.9486945029697087]],
 'C0006': [['C0200', 0.8474845246319106],
  ['C0185', 0.8398239332054643],
  ['C0169', 0.8199802177870191]],
 'C0007': [['C0085', 0.9824230037367925],
  ['C0003', 0.9716942768861005],
  ['C0005', 0.9486945029697087]],
 'C0008': [['C0109', 0.9279776786344583],
  ['C0024', 0.8375352176423817],
  ['C0173', 0.8128169916035377]],
 'C0009': [['C0032', 0.990757862861617],
  ['C0083', 0.9798033869624233],
  ['C0077', 0.9692019591273204]],
 'C0010': [['C0029', 0.998

In [31]:
# Flatten the {customer: [[similar_customer, score], ...]} mapping into one
# row per (customer, similar customer) pair
lookalike_data = [
    [customer, similar_customer, score]
    for customer, similar_list in recommendations.items()
    for similar_customer, score in similar_list
]

lookalike_df = pd.DataFrame(
    lookalike_data,
    columns=['CustomerID', 'SimilarCustomerID', 'SimilarityScore'],
)

# Write the result without the positional index column
lookalike_df.to_csv("Lookalike.csv", index=False)
