In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA

# Suppressing warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries used below.
import warnings
warnings.filterwarnings("ignore")

# Loading the datasets.
# DATA_DIR is the single place to edit when the CSV files live elsewhere.
DATA_DIR = "/content"
customers_df = pd.read_csv(f"{DATA_DIR}/Customers.csv")
products_df = pd.read_csv(f"{DATA_DIR}/Products.csv")
transactions_df = pd.read_csv(f"{DATA_DIR}/Transactions.csv")

# Displaying head and info to inspect the datasets.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
print("Customers DataFrame:")
print(customers_df.head())
customers_df.info()

print("\nProducts DataFrame:")
print(products_df.head())
products_df.info()

print("\nTransactions DataFrame:")
print(transactions_df.head())
transactions_df.info()

Customers DataFrame:
  CustomerID        CustomerName         Region  SignupDate
0      C0001    Lawrence Carroll  South America  2022-07-10
1      C0002      Elizabeth Lutz           Asia  2022-02-13
2      C0003      Michael Rivera  South America  2024-03-07
3      C0004  Kathleen Rodriguez  South America  2022-10-09
4      C0005         Laura Weber           Asia  2022-08-15
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CustomerID    200 non-null    object
 1   CustomerName  200 non-null    object
 2   Region        200 non-null    object
 3   SignupDate    200 non-null    object
dtypes: object(4)
memory usage: 6.4+ KB
None

Products DataFrame:
  ProductID              ProductName     Category   Price
0      P001     ActiveWear Biography        Books  169.30
1      P002    ActiveWear Smartwatch  Electronics  346.30
2      P003  ComfortLiv

In [3]:
# Parse the date columns so that date arithmetic is possible below
customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

# Feature engineering: enrich every transaction with its product and customer
# attributes (default inner joins on the shared key columns).
transactions_merged_df = (
    transactions_df
    .merge(products_df, on="ProductID")
    .merge(customers_df, on="CustomerID")
)


def _purchase_frequency(dates):
    """Transactions per day between the first and last purchase.

    Returns 0 when the first and last purchase are less than one full day
    apart, which also avoids dividing by zero.
    """
    active_days = (dates.max() - dates.min()).days
    if active_days > 0:
        return len(dates) / active_days
    return 0


def _most_common(values):
    """Most frequent value, taken as the first entry of ``value_counts()``."""
    return values.value_counts().index[0]


# One row per customer with the behavioural features
customer_features = (
    transactions_merged_df
    .groupby('CustomerID')
    .agg(
        num_transactions=pd.NamedAgg(column='TransactionID', aggfunc='count'),
        total_spending=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        purchase_frequency=pd.NamedAgg(column='TransactionDate', aggfunc=_purchase_frequency),
        avg_transaction_value=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        fav_product_category=pd.NamedAgg(column='Category', aggfunc=_most_common),
    )
    .reset_index()
)

# Region per customer: the modal Region over that customer's transactions
customer_region_df = (
    transactions_merged_df
    .groupby('CustomerID')['Region']
    .apply(lambda regions: regions.mode()[0])
    .reset_index()
)
customer_features = customer_features.merge(customer_region_df, on="CustomerID")

# One-hot encode the two categorical features
customer_features = pd.get_dummies(customer_features, columns=['fav_product_category', 'Region'])

# Standardise every feature column; CustomerID is an identifier, not a feature.
# The scaled frame keeps the row order of customer_features.
feature_frame = customer_features.drop(columns='CustomerID')
customer_features_scaled = StandardScaler().fit_transform(feature_frame)
customer_features_scaled_df = pd.DataFrame(customer_features_scaled, columns=feature_frame.columns)




In [5]:
# Pairwise cosine similarity between the rows of the scaled feature table:
# entry [i, j] compares the customer in row i with the customer in row j,
# where rows follow the order of customer_features.
similarity_matrix = cosine_similarity(customer_features_scaled_df)


# Example recommendation function
def get_top_n_similar_customers(customer_id, similarity_matrix, customer_features, n=3):
    """Return the ``n`` customers most similar to ``customer_id``.

    Parameters
    ----------
    customer_id
        Value to look up in ``customer_features['CustomerID']``.
    similarity_matrix
        Row-indexable square matrix; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``customer_features``.
    customer_features
        DataFrame with a ``CustomerID`` column, in the same row order as
        ``similarity_matrix``. Its index labels are irrelevant.
    n
        Maximum number of look-alikes to return (default 3).

    Returns
    -------
    list of tuple
        ``(customer_id, score)`` pairs ordered by descending score. Equal
        scores keep the row order of ``customer_features``. The queried
        customer is never part of the result.

    Raises
    ------
    IndexError
        If ``customer_id`` does not occur in ``customer_features``.
    """
    customer_ids = customer_features['CustomerID'].tolist()
    try:
        # Resolve the row *position*: the matrix is addressed by position, so
        # index labels must not be used (they differ on a non-default index).
        customer_pos = customer_ids.index(customer_id)
    except ValueError:
        raise IndexError(
            f"CustomerID {customer_id!r} not found in customer_features"
        ) from None

    # Exclude the queried customer explicitly rather than assuming it sorts
    # first: with ties or floating-point round-off another customer can rank
    # at or above the self-similarity, and dropping "the first entry" would
    # then discard a real neighbour and return the customer itself.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity_matrix[customer_pos])
        if pos != customer_pos
    ]
    # list.sort is stable, so ties keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [(customer_ids[pos], score) for pos, score in candidates[:max(n, 0)]]


# Applying function to the first 20 customers
first_20_customer_ids = customers_df['CustomerID'].head(20).tolist()

# Scores are converted to built-in floats so that the printed dict and the
# list stored in the CSV cell do not depend on NumPy's scalar repr
# (NumPy >= 2 renders such values as "np.float64(...)").
lookalike_results = {}
for customer_id in first_20_customer_ids:
    lookalike_results[customer_id] = [
        (similar_id, float(score))
        for similar_id, score in get_top_n_similar_customers(
            customer_id, similarity_matrix, customer_features
        )
    ]

print(lookalike_results)


# Saving results to csv: one row per customer, with the list of
# (similar_customer_id, score) pairs stored as text in the 'top_3' column.
lookalike_df = pd.DataFrame(
    [{'customer_id': key, 'top_3': value} for key, value in lookalike_results.items()]
)
lookalike_df.to_csv("Shadaab_Ahmad_Lookalike.csv", index=False)

{'C0001': [('C0107', 0.9939105301727779), ('C0190', 0.9874462354968219), ('C0048', 0.9819447871475125)], 'C0002': [('C0128', 0.9582218408211354), ('C0178', 0.9490372318035443), ('C0159', 0.943437184028784)], 'C0003': [('C0052', 0.9935557287245262), ('C0133', 0.9915176693062463), ('C0152', 0.9755687282219333)], 'C0004': [('C0165', 0.9754508146434131), ('C0169', 0.9431381826064418), ('C0126', 0.9412248862272112)], 'C0005': [('C0186', 0.9850689770087325), ('C0146', 0.982210725812471), ('C0007', 0.9647570567298509)], 'C0006': [('C0171', 0.9774738149891585), ('C0187', 0.9732965563279247), ('C0011', 0.9013433119924877)], 'C0007': [('C0140', 0.9752007986943486), ('C0115', 0.9677506316549932), ('C0005', 0.9647570567298509)], 'C0008': [('C0024', 0.9321824011806099), ('C0194', 0.9310843951976049), ('C0122', 0.8642607232529241)], 'C0009': [('C0010', 0.9799081613805347), ('C0062', 0.9589824023928314), ('C0111', 0.9556037463576612)], 'C0010': [('C0111', 0.9888342878817848), ('C0009', 0.979908161380