Task 2: Lookalike Model

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [2]:
df_customers = pd.read_csv(r'C:\Users\sakth\OneDrive\文档\eCommerce_Dataset_Assignment\Datasets\Customers.csv')
df_products = pd.read_csv(r'C:\Users\sakth\OneDrive\文档\eCommerce_Dataset_Assignment\Datasets\Products.csv')
df_transactions = pd.read_csv(r'C:\Users\sakth\OneDrive\文档\eCommerce_Dataset_Assignment\Datasets\Transactions.csv')

So, Here we are just taking Quantity and Totalvalue to find the  cosine_similarity for top 3 customers

In [3]:
customer_mg = df_transactions.groupby('CustomerID').agg({
    'Quantity': 'sum',
    'TotalValue': 'sum'
}).reset_index()

In [8]:
customer_data = customer_mg.merge(df_customers, on='CustomerID')

In [9]:
customer_data

Unnamed: 0,CustomerID,Quantity,TotalValue,CustomerName,Region,SignupDate
0,C0001,12,3354.52,Lawrence Carroll,South America,2022-07-10
1,C0002,10,1862.74,Elizabeth Lutz,Asia,2022-02-13
2,C0003,14,2725.38,Michael Rivera,South America,2024-03-07
3,C0004,23,5354.88,Kathleen Rodriguez,South America,2022-10-09
4,C0005,7,2034.24,Laura Weber,Asia,2022-08-15
...,...,...,...,...,...,...
194,C0196,12,4982.88,Laura Watts,Europe,2022-06-07
195,C0197,9,1928.65,Christina Harvey,Europe,2023-03-21
196,C0198,3,931.83,Rebecca Ray,Europe,2022-02-27
197,C0199,9,1979.28,Andrea Jenkins,Europe,2022-12-03


Using Scaler technique

In [10]:
scaler = StandardScaler()
customer_data[['Quantity', 'TotalValue']] = scaler.fit_transform(customer_data[['Quantity', 'TotalValue']])
customer_data

Unnamed: 0,CustomerID,Quantity,TotalValue,CustomerName,Region,SignupDate
0,C0001,-0.122033,-0.061701,Lawrence Carroll,South America,2022-07-10
1,C0002,-0.448000,-0.877744,Elizabeth Lutz,Asia,2022-02-13
2,C0003,0.203934,-0.405857,Michael Rivera,South America,2024-03-07
3,C0004,1.670787,1.032547,Kathleen Rodriguez,South America,2022-10-09
4,C0005,-0.936951,-0.783929,Laura Weber,Asia,2022-08-15
...,...,...,...,...,...,...
194,C0196,-0.122033,0.829053,Laura Watts,Europe,2022-06-07
195,C0197,-0.610984,-0.841689,Christina Harvey,Europe,2023-03-21
196,C0198,-1.588886,-1.386975,Rebecca Ray,Europe,2022-02-27
197,C0199,-0.610984,-0.813993,Andrea Jenkins,Europe,2022-12-03


Finding the similarity scores

In [12]:
similarity_matrix = cosine_similarity(customer_data[['Quantity', 'TotalValue']])
similarity_matrix

array([[ 1.        ,  0.80759386,  0.0024998 , ...,  0.96902986,
         0.89659207, -0.89647264],
       [ 0.80759386,  1.        ,  0.59175604, ...,  0.92821483,
         0.98525255, -0.98529864],
       [ 0.0024998 ,  0.59175604,  1.        , ...,  0.24936518,
         0.4450973 , -0.44533872],
       ...,
       [ 0.96902986,  0.92821483,  0.24936518, ...,  1.        ,
         0.97818527, -0.97812923],
       [ 0.89659207,  0.98525255,  0.4450973 , ...,  0.97818527,
         1.        , -0.99999996],
       [-0.89647264, -0.98529864, -0.44533872, ..., -0.97812923,
        -0.99999996,  1.        ]])

finding the top 3 lookalike customers for (CustomerID: C0001 - C0020)  in Customers.csv

In [14]:
def get_top_3_lookalikes(customer_id, similarity_matrix, customer_ids):
    customer_index = customer_ids.index(customer_id)
    scores = similarity_matrix[customer_index]
    scored_customers = [(customer_ids[i], score) for i, score in enumerate(scores)]
    scored_customers.sort(key=lambda x: x[1], reverse=True)
    top_3_lookalikes = scored_customers[1:4]
    return top_3_lookalikes

In [16]:
lookalikes = {}
customer_ids = customer_data['CustomerID'].tolist()

In [17]:
for cust_id in customer_ids[:20]:
    top_3_lookalikes = get_top_3_lookalikes(cust_id, similarity_matrix, customer_ids)
    lookalikes[cust_id] = top_3_lookalikes

In [18]:
print(f"Customer ID: {cust_id}")
print("Top 3 Lookalikes:")
for lookalike_id, score in top_3_lookalikes:
    print(f" - {lookalike_id} (Similarity: {score:.2f})")
print()

Customer ID: C0020
Top 3 Lookalikes:
 - C0058 (Similarity: 1.00)
 - C0193 (Similarity: 1.00)
 - C0198 (Similarity: 1.00)



In [20]:
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalikes.keys()), 
    'Lookalikes': [str(lookalike) for lookalike in lookalikes.values()] 
})

In [22]:
lookalike_df.to_csv('Sakthirajkumar_K_Lookalike.csv ', index=False)

In [21]:
lookalike_df

Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0085', np.float64(0.9999990504724361)), ('..."
1,C0002,"[('C0157', np.float64(0.9999942410168485)), ('..."
2,C0003,"[('C0111', np.float64(0.9940081095432594)), ('..."
3,C0004,"[('C0162', np.float64(0.9999999965087093)), ('..."
4,C0005,"[('C0080', np.float64(0.999982235548051)), ('C..."
5,C0006,"[('C0079', np.float64(0.9999656845154902)), ('..."
6,C0007,"[('C0146', np.float64(0.9999895943808703)), ('..."
7,C0008,"[('C0109', np.float64(0.9998419065580372)), ('..."
8,C0009,"[('C0015', np.float64(0.9999998280836607)), ('..."
9,C0010,"[('C0176', np.float64(0.9977495907269393)), ('..."
