#import all the library which needed in this project

In [30]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns

#Load Data

In [31]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
customers = pd.read_csv("/content/drive/MyDrive/Zeotap Data intern project/Customers.csv")
products = pd.read_csv("/content/drive/MyDrive/Zeotap Data intern project/Products.csv")
transactions = pd.read_csv("/content/drive/MyDrive/Zeotap Data intern project/Transactions.csv")

In [7]:
transactions.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68


In [8]:
print(customers.isnull().sum())
print(products.isnull().sum())
print(transactions.isnull().sum())

CustomerID      0
CustomerName    0
Region          0
SignupDate      0
dtype: int64
ProductID      0
ProductName    0
Category       0
Price          0
dtype: int64
TransactionID      0
CustomerID         0
ProductID          0
TransactionDate    0
Quantity           0
TotalValue         0
Price              0
dtype: int64


Mounted at /content/drive


# Merge all three dataset

In [9]:
data = transactions.merge(customers, on="CustomerID").merge(products, on="ProductID")


In [10]:
data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


#To check the no. of (rows,columns)

In [13]:
data.shape

(1000, 13)

# To see the datatypes of our dataset and also show the non_null count values

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionID    1000 non-null   object 
 1   CustomerID       1000 non-null   object 
 2   ProductID        1000 non-null   object 
 3   TransactionDate  1000 non-null   object 
 4   Quantity         1000 non-null   int64  
 5   TotalValue       1000 non-null   float64
 6   Price_x          1000 non-null   float64
 7   CustomerName     1000 non-null   object 
 8   Region           1000 non-null   object 
 9   SignupDate       1000 non-null   object 
 10  ProductName      1000 non-null   object 
 11  Category         1000 non-null   object 
 12  Price_y          1000 non-null   float64
dtypes: float64(3), int64(1), object(9)
memory usage: 101.7+ KB


# Feature engineering
The process of using domin knowledge to extract features from raw data. these features can be used to improve the performance of machine learning Algorithm.


In [15]:
customer_features = data.groupby("CustomerID").agg({
    "Price_x": ["sum", "mean"],
    "TransactionID": "count",
    "Category": lambda x: list(x)
}).reset_index()
customer_features.columns = ["CustomerID", "TotalSpent", "AvgSpent", "TransactionCount", "Categories"]

In [37]:
customer_features.sample(5)

Unnamed: 0,CustomerID,TotalSpent,AvgSpent,TransactionCount,Categories,Books,Clothing,Electronics,Home Decor
6,C0007,1050.06,350.02,3,"[Electronics, Electronics, Home Decor]",0,0,2,1
77,C0078,497.76,497.76,1,[Books],1,0,0,0
161,C0162,2047.59,255.94875,8,"[Books, Clothing, Electronics, Electronics, Ho...",1,1,3,3
170,C0171,1609.59,321.918,5,"[Books, Books, Books, Books, Books]",5,0,0,0
107,C0108,1814.28,259.182857,7,"[Clothing, Clothing, Home Decor, Books, Home D...",2,3,0,2


In [17]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        199 non-null    object 
 1   TotalSpent        199 non-null    float64
 2   AvgSpent          199 non-null    float64
 3   TransactionCount  199 non-null    int64  
 4   Categories        199 non-null    object 
dtypes: float64(2), int64(1), object(2)
memory usage: 7.9+ KB


# To convert categorical column to numerical column
1) One_hot_encoding is used with nominal column or data.


In [18]:
category_features = pd.get_dummies(data.set_index("CustomerID")["Category"]).groupby("CustomerID").sum().reset_index()
print(category_features.head())
customer_features = customer_features.merge(category_features, on="CustomerID")


  CustomerID  Books  Clothing  Electronics  Home Decor
0      C0001      1         0            3           1
1      C0002      0         2            0           2
2      C0003      0         1            1           2
3      C0004      3         0            2           3
4      C0005      0         0            2           1


In [19]:
customer_features.head()

Unnamed: 0,CustomerID,TotalSpent,AvgSpent,TransactionCount,Categories,Books,Clothing,Electronics,Home Decor
0,C0001,1391.67,278.334,5,"[Books, Home Decor, Electronics, Electronics, ...",1,0,3,1
1,C0002,835.68,208.92,4,"[Home Decor, Home Decor, Clothing, Clothing]",0,2,0,2
2,C0003,782.83,195.7075,4,"[Home Decor, Home Decor, Clothing, Electronics]",0,1,1,2
3,C0004,1925.09,240.63625,8,"[Books, Home Decor, Home Decor, Home Decor, Bo...",3,0,2,3
4,C0005,874.81,291.603333,3,"[Home Decor, Electronics, Electronics]",0,0,2,1


In [21]:
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CustomerID        199 non-null    object 
 1   TotalSpent        199 non-null    float64
 2   AvgSpent          199 non-null    float64
 3   TransactionCount  199 non-null    int64  
 4   Categories        199 non-null    object 
 5   Books             199 non-null    int64  
 6   Clothing          199 non-null    int64  
 7   Electronics       199 non-null    int64  
 8   Home Decor        199 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 14.1+ KB


#Describe function:
The describe() function in pandas provides a summary of key statistical measures for numerical columns in a dataset, such as:

**Count:** Number of non-null values.

**Mean:** Average of the values.

**Standard deviation:** Measure of data spread.

**Minimum and Maximum:** Range of the data.

**Percentiles (25%, 50%, 75%):** Distribution insights.

It helps in quickly understanding the data's distribution, variability, and identifying potential issues like outliers.

In [22]:
customer_features.describe()

Unnamed: 0,TotalSpent,AvgSpent,TransactionCount,Books,Clothing,Electronics,Home Decor
count,199.0,199.0,199.0,199.0,199.0,199.0,199.0
mean,1369.618442,271.753916,5.025126,1.356784,1.145729,1.276382,1.246231
std,663.369301,69.680617,2.19834,1.113889,1.102737,1.114185,1.116756
min,41.18,41.18,1.0,0.0,0.0,0.0,0.0
25%,895.375,231.054,3.0,1.0,0.0,0.0,0.0
50%,1301.1,271.602857,5.0,1.0,1.0,1.0,1.0
75%,1810.28,308.566125,6.0,2.0,2.0,2.0,2.0
max,3857.06,497.76,11.0,5.0,5.0,5.0,6.0


# Feature Scaling:
It is a technique to standardize the independent features to present in the data in a fixed range.


Two types of feacture scaling
 Standardization:

1.   Standardization
2.   Normalization


Standardization is the process of transforming data so that it has a mean of 0 and a standard deviation of 1. This ensures that all features contribute equally to the analysis, especially when they are measured on different scales.

Z=(xi-x')/SD


Normalization:
Normalization scales data to fit within a specific range, typically 0 to 1, preserving relative proportions. It ensures comparability of features, especially for algorithms sensitive to scale, like gradient descent.


In [23]:

scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features.drop(["CustomerID", "Categories"], axis=1))


# Similarity Matrix:
Cosine similarity measures how similar two vectors are by calculating the cosine of the angle between them. It focuses on the direction of the vectors rather than their magnitude, making it useful for comparing text or feature vectors.




In [25]:
similarity_matrix = cosine_similarity(scaled_features)
similarity_matrix


array([[ 1.        , -0.54480409, -0.03381188, ...,  0.06426813,
         0.59483048, -0.5321083 ],
       [-0.54480409,  1.        ,  0.83623892, ...,  0.50034709,
         0.2801638 ,  0.29113014],
       [-0.03381188,  0.83623892,  1.        , ...,  0.5906888 ,
         0.71480888, -0.1646688 ],
       ...,
       [ 0.06426813,  0.50034709,  0.5906888 , ...,  1.        ,
         0.427442  ,  0.0860122 ],
       [ 0.59483048,  0.2801638 ,  0.71480888, ...,  0.427442  ,
         1.        , -0.46491023],
       [-0.5321083 ,  0.29113014, -0.1646688 , ...,  0.0860122 ,
        -0.46491023,  1.        ]])

# Top_3 customer **looks** like

In [38]:
#  create Looklikes dictionary
lookalikes = {}
for idx, customer_id in enumerate(customer_features["CustomerID"][:20]):
    # Get similarity scores for the current customer
    scores = similarity_matrix[idx]
    # Sort by similarity and exclude self
    top_3 = sorted([(customer_features["CustomerID"][i], scores[i]) for i in range(len(scores)) if i != idx],
                   key=lambda x: x[1], reverse=True)[:3]
    lookalikes[customer_id] = top_3
print(lookalikes)

{'C0001': [('C0069', 0.9501907058522422), ('C0127', 0.7913690444622133), ('C0120', 0.7806510053245792)], 'C0002': [('C0134', 0.9308277285329609), ('C0062', 0.9068409293898364), ('C0159', 0.8767826181304359)], 'C0003': [('C0031', 0.9639669546263142), ('C0166', 0.9283900037293031), ('C0189', 0.8804622993969058)], 'C0004': [('C0047', 0.937189885130038), ('C0194', 0.9206625096513794), ('C0090', 0.9203352043697169)], 'C0005': [('C0007', 0.9253924799360158), ('C0197', 0.9079942300519187), ('C0120', 0.8988238855416415)], 'C0006': [('C0135', 0.9555336164183147), ('C0185', 0.8275296586412438), ('C0131', 0.8200086853194648)], 'C0007': [('C0005', 0.9253924799360158), ('C0146', 0.8946041689296131), ('C0026', 0.8825463259993708)], 'C0008': [('C0162', 0.9509142555420406), ('C0181', 0.8980749425501472), ('C0113', 0.8958633751758829)], 'C0009': [('C0092', 0.891881404341469), ('C0040', 0.8430995475121343), ('C0150', 0.831989341413928)], 'C0010': [('C0077', 0.9433656058993513), ('C0176', 0.9340701007173

# Saving results as csv file of customer with  Top_3 similarCustomer

In [27]:

lookalike_df = pd.DataFrame([
    [cust] + [sim[0] for sim in lookalikes[cust]] + [sim[1] for sim in lookalikes[cust]]
    for cust in lookalikes
], columns=["CustomerID", "SimilarCustomer1", "SimilarCustomer2", "SimilarCustomer3",
            "Score1", "Score2", "Score3"])
lookalike_df.to_csv("Lookalike.csv", index=False)

# Test the model by entering user input try to get top_3 similar customer_id and similarity percentage

In [36]:

def test_lookalikes(customer_id):
    if customer_id in lookalikes:
        top_3 = lookalikes[customer_id]
        print(f"Top 3 lookalike customers for {customer_id}:")
        for i, (similar_cust, score) in enumerate(top_3, 1):
            print(f"{i}. CustomerID: {similar_cust}, Similarity Score: {score:.2f}")
    else:
        print(f"CustomerID {customer_id} is not in the top 20 customers.")

# User Input
user_input = input("Enter a CustomerID to test the model (e.g., C0001): ")
test_lookalikes(user_input.strip())

Enter a CustomerID to test the model (e.g., C0001): C0001
Top 3 lookalike customers for C0001:
1. CustomerID: C0069, Similarity Score: 0.95
2. CustomerID: C0127, Similarity Score: 0.79
3. CustomerID: C0120, Similarity Score: 0.78
