#### Import libraries

In [28]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

#### Load data

In [37]:
# Load the input table. The aggregation cell below groups it by CustomerID and
# reads its TotalValue, Quantity, ProductID and Category columns.
merged_data = pd.read_csv("Final_clean_data.csv")

#### Feature engineering
##### Aggregate customer transaction behavior

In [38]:
# Collapse the input rows to a single summary row per customer.
customer_features = (
    merged_data
    .groupby("CustomerID")
    .agg(
        TotalValue=("TotalValue", "sum"),    # total spend
        Quantity=("Quantity", "sum"),        # total quantity purchased
        ProductID=("ProductID", "nunique"),  # number of distinct products
        Category=("Category", "nunique"),    # number of distinct categories
    )
    .reset_index()  # turn CustomerID back into an ordinary column
)


#### Merge with customer profile information

In [39]:
# Attach the profile columns from Customers.csv to the per-customer aggregates.
# A left join keeps every aggregated customer, even without a profile row.
customer_profiles = pd.read_csv("Customers.csv")
customer_features = pd.merge(
    customer_features,
    customer_profiles,
    how="left",
    on="CustomerID",
)


In [40]:
# Sanity check after the merge: column dtypes and non-null counts per column.
customer_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 199 entries, 0 to 198
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   CustomerID    199 non-null    object 
 1   TotalValue    199 non-null    float64
 2   Quantity      199 non-null    int64  
 3   ProductID     199 non-null    int64  
 4   Category      199 non-null    int64  
 5   CustomerName  199 non-null    object 
 6   Region        199 non-null    object 
 7   SignupDate    199 non-null    object 
dtypes: float64(1), int64(3), object(4)
memory usage: 14.0+ KB


#### Encode categorical features 

In [41]:
# One-hot encode Region; drop_first=True omits the first level's indicator column.
# NOTE(review): the similarity cell only reads the columns in `numerical_features`,
# so these Region indicator columns do not influence the lookalike scores as
# written - confirm whether region was meant to be part of the similarity input.
customer_features = pd.get_dummies(customer_features, columns=["Region"], drop_first=True)


#### Normalize numerical features

In [43]:
# Standardise the four behaviour columns so that no single column dominates the
# similarity computation purely because of its scale.
numerical_features = ["TotalValue", "Quantity", "ProductID", "Category"]
scaler = StandardScaler().fit(customer_features[numerical_features])
customer_features[numerical_features] = scaler.transform(customer_features[numerical_features])



#### Compute similarity scores

In [44]:
# Pairwise cosine similarity between customers over the columns listed in
# `numerical_features`; row i holds customer i's score against every customer,
# itself included.
# NOTE(review): the Region indicator columns, CustomerName and SignupDate are not
# part of this input, so profile data does not affect the scores - confirm intended.
similarity_matrix = cosine_similarity(customer_features[numerical_features])


#### Build lookalike recommendations for the first 20 customers

In [45]:
# Map each of the first 20 customers to its three most similar *other* customers,
# stored as (CustomerID, similarity score) pairs with the best match first.
lookalikes = {}
for idx, customer_id in enumerate(customer_features["CustomerID"].iloc[:20]):
    # Exclude the customer's own entry by position instead of assuming it sorts
    # first. With a tie at the top score (for example another customer whose
    # feature vector is parallel) or floating-point rounding, slicing off element
    # 0 could drop a genuine match and list the customer as its own lookalike.
    candidates = [
        # float() keeps plain Python floats in the tuples, so their string form
        # in the exported CSV does not depend on NumPy's scalar repr.
        (other_idx, float(score))
        for other_idx, score in enumerate(similarity_matrix[idx])
        if other_idx != idx
    ]
    # sorted() is stable, so equal scores keep their original row order.
    similar_customers = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:3]

    lookalikes[customer_id] = [
        (customer_features["CustomerID"].iloc[other_idx], score)
        for other_idx, score in similar_customers
    ]

#### Save lookalikes to CSV

In [46]:

# One row per source customer (index = CustomerID) and one column per ranked
# match; each cell holds a (CustomerID, score) tuple.
lookalikes_df = pd.DataFrame.from_dict(lookalikes, orient="index")
# header=False omits the 0/1/2 column labels; the CustomerID index is still written.
# NOTE(review): the tuple cells are presumably written in their string form, so
# consumers of this file have to parse "(id, score)" text - confirm that is acceptable.
lookalikes_df.to_csv("Nikita_Mate_Lookalike.csv", header=False)


In [47]:
# Display the lookalike table inline: rows are source customers, columns 0-2 are
# the ranked matches.
lookalikes_df

Unnamed: 0,0,1,2
C0001,"(C0056, 0.937994040227549)","(C0164, 0.9307627671474238)","(C0116, 0.9170817136676287)"
C0002,"(C0199, 0.9924781135690828)","(C0142, 0.9880872862686263)","(C0010, 0.9711735190184863)"
C0003,"(C0027, 0.8715087905460797)","(C0166, 0.7558069799219821)","(C0029, 0.678853897054224)"
C0004,"(C0124, 0.9885565212761119)","(C0195, 0.9879542415506737)","(C0156, 0.9879375303357136)"
C0005,"(C0131, 0.999692805150406)","(C0058, 0.9996223015540427)","(C0097, 0.9987487502678624)"
C0006,"(C0079, 0.9998807121946304)","(C0196, 0.9861922273326098)","(C0026, 0.8081475746461997)"
C0007,"(C0078, 0.9959032019158693)","(C0080, 0.9955154812840622)","(C0020, 0.9943060609259584)"
C0008,"(C0162, 0.9581866194529632)","(C0017, 0.9524675930003565)","(C0093, 0.9387553783693624)"
C0009,"(C0083, 0.9962580918556325)","(C0198, 0.9859642161840871)","(C0015, 0.9783121781932388)"
C0010,"(C0142, 0.9752861447632982)","(C0002, 0.9711735190184863)","(C0199, 0.9348224643446227)"
