In [1]:
!pip install scikit-learn



In [2]:
# Sanity check: confirm scikit-learn is importable and print the installed version
import sklearn
print(sklearn.__version__)

1.3.0


1. Loading the Data

In [13]:
import pandas as pd

# Read the three CSV files (customers, products, transactions) from the working directory
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Customers.csv', 'Products.csv', 'Transactions.csv')
)

# Preview the first rows of every dataset to get a feel for the columns
(customers_df.head(), products_df.head(), transactions_df.head())

(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

2. Data Preprocessing and Feature Engineering

In [14]:
# Join every transaction with its customer's profile and add the spend of that
# transaction (quantity bought times unit price) as a TotalSpend column
merged_df = (
    transactions_df
    .merge(customers_df, on='CustomerID')
    .assign(TotalSpend=lambda txn: txn['Quantity'] * txn['Price'])
)

# One row per customer: overall spend and number of transactions, followed by
# the customer's Region looked up from the customers dataframe
customer_features = (
    merged_df
    .groupby('CustomerID')
    .agg(
        total_spent=pd.NamedAgg(column='TotalSpend', aggfunc='sum'),
        purchase_frequency=pd.NamedAgg(column='TransactionID', aggfunc='count'),
    )
    .reset_index()
    .merge(customers_df[['CustomerID', 'Region']], on='CustomerID')
)

# Preview the per-customer features
customer_features.head()

Unnamed: 0,CustomerID,total_spent,purchase_frequency,Region
0,C0001,3354.52,5,South America
1,C0002,1862.74,4,Asia
2,C0003,2725.38,4,South America
3,C0004,5354.88,8,South America
4,C0005,2034.24,3,Asia


3. Product Preference Representation

In [15]:
# Build the customer x product table: one row per CustomerID, one column per
# ProductID, each cell holding the summed Quantity (0 where nothing was bought)
customer_product_matrix = pd.pivot_table(
    merged_df,
    index='CustomerID',
    columns='ProductID',
    values='Quantity',
    aggfunc='sum',
    fill_value=0,
)

# Preview the first few customers
customer_product_matrix.head()

ProductID,P001,P002,P003,P004,P005,P006,P007,P008,P009,P010,...,P091,P092,P093,P094,P095,P096,P097,P098,P099,P100
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
C0002,0,0,0,4,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
C0003,0,4,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C0004,0,0,0,0,0,0,0,2,0,0,...,0,0,0,0,0,0,3,0,0,0
C0005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


4. Data Normalization

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardize every product column of the customer-product matrix:
# first learn the per-column statistics, then apply them to the same data
scaler = StandardScaler().fit(customer_product_matrix)
normalized_matrix = scaler.transform(customer_product_matrix)

# Show the scaled rows of the first 5 customers
normalized_matrix[0:5]

array([[-0.19155441, -0.19831279, -0.23403059, -0.17304493, -0.20175905,
        -0.24614845, -0.18947347, -0.22087051, -0.17279088, -0.22536027,
        -0.21116754, -0.17754277, -0.20966171, -0.14175444, -0.16701066,
        -0.18972568, -0.20788694, -0.17452778, -0.22952917, -0.2124663 ,
        -0.21837601,  3.87187453, -0.17693977, -0.14746422, -0.23276104,
        -0.16192659, -0.23346127, -0.24314056,  3.44757677, -0.18972568,
        -0.12865022, -0.2211743 , -0.19875551, -0.19259753, -0.20529159,
        -0.21085105, -0.17279088, -0.22758963, -0.23291368, -0.24614845,
        -0.21818701, -0.18787481, -0.22068712, -0.08270697, -0.19155441,
        -0.18697859, -0.21488208, -0.26361028, -0.23746991, -0.23382097,
        -0.21338733, -0.15333663, -0.20347407,  2.09866132, -0.2076852 ,
        -0.18495205, -0.24747817, -0.2046578 , -0.27233719, -0.14919877,
        -0.25427322, -0.2693437 , -0.20347407, -0.20326144, -0.22532583,
        -0.21085105, -0.24148109, -0.21345765, -0.2

5. Calculating Similarity (Cosine Similarity)

In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity between all customers based on the normalized data.
# Only one matrix is passed, so every row is compared with every other row:
# entry [i, j] is the similarity between the i-th and j-th rows of normalized_matrix.
similarity_matrix = cosine_similarity(normalized_matrix)

# Display the similarity matrix (first 5 rows and columns)
similarity_matrix[:5, :5]

array([[ 1.        , -0.04882928, -0.06147586, -0.07906018, -0.05168909],
       [-0.04882928,  1.        , -0.03569919, -0.05168252, -0.02306581],
       [-0.06147586, -0.03569919,  1.        ,  0.04022236,  0.24429628],
       [-0.07906018, -0.05168252,  0.04022236,  1.        ,  0.07985298],
       [-0.05168909, -0.02306581,  0.24429628,  0.07985298,  1.        ]])

6. Recommend Top 3 Lookalikes

In [18]:
# Function to get the top lookalikes (3 by default) for a given customer
def get_top_3_lookalikes(similarity_matrix, customer_id, customer_product_matrix, top_n=3):
    """Return the customers most similar to ``customer_id``, best match first.

    Parameters
    ----------
    similarity_matrix : numpy.ndarray
        Square matrix of pairwise similarity scores. Row/column ``i`` must
        correspond to the ``i``-th label of ``customer_product_matrix.index``.
    customer_id : hashable
        Label of the customer to find lookalikes for; must be present in
        ``customer_product_matrix.index``.
    customer_product_matrix : pandas.DataFrame
        Only its index is used, to translate between customer labels and
        row positions of ``similarity_matrix``.
    top_n : int, default 3
        Maximum number of lookalikes to return.

    Returns
    -------
    list of tuple
        ``(lookalike_customer_id, similarity_score)`` pairs sorted by
        descending score. The customer itself is never included. Fewer than
        ``top_n`` pairs are returned when there are not enough other customers.

    Raises
    ------
    KeyError
        If ``customer_id`` is not a label of ``customer_product_matrix.index``.
    """
    # Get the index of the given customer in the customer-product matrix
    customer_idx = customer_product_matrix.index.get_loc(customer_id)

    # Get the similarity scores for the given customer
    similarity_scores = similarity_matrix[customer_idx]

    # Rank every customer from most to least similar
    ranked_idx = similarity_scores.argsort()[::-1]

    # Exclude the customer by position instead of assuming it is ranked first:
    # another customer can tie with the self-similarity score (identical
    # purchase vectors), and the self score is not guaranteed to be the maximum
    # of the row, so dropping "the last sorted element" could keep the customer
    # itself and discard a genuine best match.
    ranked_idx = ranked_idx[ranked_idx != customer_idx][:top_n]

    similar_customers = customer_product_matrix.index[ranked_idx]
    return list(zip(similar_customers, similarity_scores[ranked_idx]))

# Test the function for a sample customer (e.g., 'C0001'); the result is a list of
# (customer ID, similarity score) tuples
get_top_3_lookalikes(similarity_matrix, 'C0001', customer_product_matrix)

[('C0194', 0.4049275311893231),
 ('C0104', 0.3740015051203954),
 ('C0020', 0.36660865634533374)]

7. Create Lookalike Mapping for First 20 Customers

In [19]:
# Map each of the first 20 customers (in matrix row order) to its list of lookalikes
lookalike_mapping = {
    cid: get_top_3_lookalikes(similarity_matrix, cid, customer_product_matrix)
    for cid in customer_product_matrix.index[:20]
}

# Inspect the entry of one sample customer
lookalike_mapping['C0001']

[('C0194', 0.4049275311893231),
 ('C0104', 0.3740015051203954),
 ('C0020', 0.36660865634533374)]

8. Save Lookalikes to CSV

In [20]:
# Flatten the mapping into one record per customer: the customer's ID followed by
# the ID and score of each of its three lookalikes
lookalike_data = [
    {
        'CustomerID': cid,
        'Lookalike1': matches[0][0],
        'Score1': matches[0][1],
        'Lookalike2': matches[1][0],
        'Score2': matches[1][1],
        'Lookalike3': matches[2][0],
        'Score3': matches[2][1],
    }
    for cid, matches in lookalike_mapping.items()
]

# Turn the records into a DataFrame and write it to Lookalike.csv (no index column)
lookalike_df = pd.DataFrame(lookalike_data)
lookalike_df.to_csv('Lookalike.csv', index=False)

# Preview the first rows of what was written
lookalike_df.head()

Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0194,0.404928,C0104,0.374002,C0020,0.366609
1,C0002,C0030,0.404617,C0091,0.383778,C0071,0.320158
2,C0003,C0181,0.477572,C0134,0.471016,C0144,0.4238
3,C0004,C0070,0.351901,C0175,0.316098,C0132,0.279599
4,C0005,C0096,0.487456,C0023,0.470252,C0055,0.3821
