In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


In [3]:
customers=pd.read_csv('/Users/nihaalnadaf/Downloads/Customers.csv')
products=pd.read_csv('/Users/nihaalnadaf/Downloads/Products.csv')
transactions=pd.read_csv('/Users/nihaalnadaf/Downloads/Transactions.csv')

In [22]:
# Merge datasets
data = customers.merge(transactions, on="CustomerID", how="left") 
data = data.merge(products, on="ProductID", how="left") 

In [24]:
data.head()

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2.0,114.6,57.3,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3.0,412.62,137.54,HomeSense Wall Art,Home Decor,137.54
2,C0001,Lawrence Carroll,South America,2022-07-10,T00085,P096,2024-04-08 00:01:00,2.0,614.94,307.47,SoundWave Headphones,Electronics,307.47
3,C0001,Lawrence Carroll,South America,2022-07-10,T00445,P083,2024-05-07 03:11:44,2.0,911.44,455.72,ActiveWear Smartwatch,Electronics,455.72
4,C0001,Lawrence Carroll,South America,2022-07-10,T00436,P029,2024-11-02 17:04:16,3.0,1300.92,433.64,TechPro Headphones,Electronics,433.64


In [26]:
data['ProductPrice']=data['Price_x']
data=data.drop(columns='Price_x')
data=data.drop(columns='Price_y')

In [28]:
data.head(2)

Unnamed: 0,CustomerID,CustomerName,Region,SignupDate,TransactionID,ProductID,TransactionDate,Quantity,TotalValue,ProductName,Category,ProductPrice
0,C0001,Lawrence Carroll,South America,2022-07-10,T00015,P054,2024-01-19 03:12:55,2.0,114.6,SoundWave Cookbook,Books,57.3
1,C0001,Lawrence Carroll,South America,2022-07-10,T00932,P022,2024-09-17 09:01:18,3.0,412.62,HomeSense Wall Art,Home Decor,137.54


## Feature Engineering

In [140]:
customer_features = data.groupby("CustomerID").agg({
    "TotalValue": "sum",  
    "Quantity": "mean", 
    "ProductPrice": "mean",  
    "Category": lambda x: x.mode()[0] if not x.mode().empty else None,
    "Region": "first" 
}).reset_index()


In [142]:
customer_features 

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductPrice,Category,Region
0,C0001,3354.52,2.400000,278.334000,Electronics,South America
1,C0002,1862.74,2.500000,208.920000,Clothing,Asia
2,C0003,2725.38,3.500000,195.707500,Home Decor,South America
3,C0004,5354.88,2.875000,240.636250,Books,South America
4,C0005,2034.24,2.333333,291.603333,Electronics,Asia
...,...,...,...,...,...,...
195,C0196,4982.88,3.000000,416.992500,Home Decor,Europe
196,C0197,1928.65,3.000000,227.056667,Electronics,Europe
197,C0198,931.83,1.500000,239.705000,Clothing,Europe
198,C0199,1979.28,2.250000,250.610000,Electronics,Europe


In [144]:
customer_features.isnull().sum()

CustomerID      0
TotalValue      0
Quantity        1
ProductPrice    1
Category        1
Region          0
dtype: int64

In [146]:
numerical_columns = ["TotalValue", "Quantity", "ProductPrice"]
customer_features[numerical_columns] = customer_features[numerical_columns].fillna(0)


In [148]:
customer_features['Category'] = customer_features['Category'].fillna('unknown')

In [150]:
customer_features.head()

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductPrice,Category,Region
0,C0001,3354.52,2.4,278.334,Electronics,South America
1,C0002,1862.74,2.5,208.92,Clothing,Asia
2,C0003,2725.38,3.5,195.7075,Home Decor,South America
3,C0004,5354.88,2.875,240.63625,Books,South America
4,C0005,2034.24,2.333333,291.603333,Electronics,Asia


In [152]:
customer_features.isnull().sum()

CustomerID      0
TotalValue      0
Quantity        0
ProductPrice    0
Category        0
Region          0
dtype: int64

### One-hot encode for Region and Category features

In [155]:
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
categorical_columns = ['Region', 'Category']
encoded_data = encoder.fit_transform(customer_features[categorical_columns])
encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_columns))
customer_features = customer_features.drop(columns=categorical_columns)
customer_features = pd.concat([customer_features, encoded_df], axis=1)

In [157]:
customer_features .head(2)

Unnamed: 0,CustomerID,TotalValue,Quantity,ProductPrice,Region_Asia,Region_Europe,Region_North America,Region_South America,Category_Books,Category_Clothing,Category_Electronics,Category_Home Decor,Category_unknown
0,C0001,3354.52,2.4,278.334,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,C0002,1862.74,2.5,208.92,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [159]:
# Normalize the data
scaler = StandardScaler()
normalized_features = scaler.fit_transform(customer_features.drop("CustomerID", axis=1))

## K-Nearest Neighbors model for similarity

In [161]:
knn = NearestNeighbors(n_neighbors=4, metric="cosine")  
knn.fit(normalized_features)

In [164]:
lookalikes = {}
for idx, customer_id in enumerate(customer_features["CustomerID"][:20]):
    distances, indices = knn.kneighbors([normalized_features[idx]]) 
    similar_customers = [
        (customer_features["CustomerID"][i], round(1 - distances[0][j], 2)) 
        for j, i in enumerate(indices[0][1:]) 
    ]
    lookalikes[customer_id] = similar_customers


### Save the results to a CSV 


In [166]:
lookalike_df = pd.DataFrame({
    "CustomerID": lookalikes.keys(),
    "SimilarCustomers": [str(v) for v in lookalikes.values()]
})
lookalike_df.to_csv("/Users/nihaalnadaf/Documents/Transactions/Nihaal_Nadaf_Lookalike.csv", index=False)

In [168]:
lookalike_df

Unnamed: 0,CustomerID,SimilarCustomers
0,C0001,"[('C0181', 1.0), ('C0192', 1.0), ('C0048', 0.95)]"
1,C0002,"[('C0106', 1.0), ('C0088', 0.97), ('C0134', 0...."
2,C0003,"[('C0195', 1.0), ('C0151', 0.91), ('C0113', 0.9)]"
3,C0004,"[('C0153', 1.0), ('C0165', 0.98), ('C0087', 0...."
4,C0005,"[('C0140', 1.0), ('C0186', 1.0), ('C0146', 0.99)]"
5,C0006,"[('C0171', 1.0), ('C0168', 0.98), ('C0011', 0...."
6,C0007,"[('C0115', 1.0), ('C0186', 0.98), ('C0140', 0...."
7,C0008,"[('C0189', 1.0), ('C0038', 0.96), ('C0160', 0...."
8,C0009,"[('C0103', 1.0), ('C0198', 0.97), ('C0061', 0...."
9,C0010,"[('C0111', 1.0), ('C0062', 0.99), ('C0149', 0...."
