# Importing the required Packages :

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
customers = pd.read_csv('/content/Customers.csv')
products = pd.read_csv('/content/Products.csv')
transactions = pd.read_csv('/content/Transactions.csv')


In [9]:
# Merge Transactions with Customers to get customer information
transactions_with_customers = pd.merge(transactions, customers, on='CustomerID', how='inner')

# Merge the result with Products to get product information
full_data = pd.merge(transactions_with_customers, products, on='ProductID', how='inner')

# Check the structure of merged data
full_data.head()

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,ProductName,Category,Price_y
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,Andrea Jenkins,Europe,2022-12-03,ComfortLiving Bluetooth Speaker,Electronics,300.68
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,Brittany Harvey,Asia,2024-09-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,Kathryn Stevens,Europe,2024-04-04,ComfortLiving Bluetooth Speaker,Electronics,300.68
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,Travis Campbell,South America,2024-04-11,ComfortLiving Bluetooth Speaker,Electronics,300.68
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,Timothy Perez,Europe,2022-03-15,ComfortLiving Bluetooth Speaker,Electronics,300.68


In [10]:
# Aggregate data to get customer-level features
customer_features = full_data.groupby('CustomerID').agg(
    total_spent=('TotalValue', 'sum'),
    total_quantity=('Quantity', 'sum'),
    unique_products=('ProductID', 'nunique'),
    category_preference=('Category', lambda x: x.mode()[0])  # Most frequent category purchased
).reset_index()

# Check the customer features
customer_features.head()

Unnamed: 0,CustomerID,total_spent,total_quantity,unique_products,category_preference
0,C0001,3354.52,12,5,Electronics
1,C0002,1862.74,10,4,Clothing
2,C0003,2725.38,14,4,Home Decor
3,C0004,5354.88,23,8,Books
4,C0005,2034.24,7,3,Electronics


In [11]:
from sklearn.preprocessing import StandardScaler

# Normalize the customer features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[['total_spent', 'total_quantity', 'unique_products']])

# Add the category preference (categorical data)
category_mapping = pd.get_dummies(customer_features['category_preference'])
final_features = np.hstack([scaled_features, category_mapping])

# Check the final features
final_features.shape


(199, 7)

In [12]:
# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(final_features)

# Check the similarity between the first 2 customers
similarity_matrix[0, 1]


0.05910351612044419

In [13]:
# Create an empty dictionary to store the top 3 lookalikes for each customer
lookalike_dict = {}

# For each customer (C0001 to C0020)
for customer_index in range(20):
    # Get similarity scores for the customer
    similarity_scores = similarity_matrix[customer_index]

    # Get the indices of the top 3 most similar customers (excluding the customer themselves)
    similar_customers = np.argsort(similarity_scores)[-4:-1]  # Exclude the customer itself

    # Get the similarity scores for the top 3 similar customers
    top_3_similarities = [(customers.iloc[i]['CustomerID'], similarity_scores[i]) for i in similar_customers]

    # Store the result
    lookalike_dict[customers.iloc[customer_index]['CustomerID']] = top_3_similarities

# Check the lookalike dictionary
lookalike_dict

{'C0001': [('C0048', 0.9647721266176639),
  ('C0055', 0.9668478142511668),
  ('C0072', 0.9695262226511742)],
 'C0002': [('C0077', 0.9579502980875815),
  ('C0010', 0.9736328913467542),
  ('C0029', 0.99974512639411)],
 'C0003': [('C0038', 0.9048525280750008),
  ('C0160', 0.9105142193604102),
  ('C0166', 0.9119503204870606)],
 'C0004': [('C0175', 0.9882273117204532),
  ('C0017', 0.9905342436124039),
  ('C0075', 0.992734056440877)],
 'C0005': [('C0007', 0.9866014226650415),
  ('C0112', 0.9961334592706172),
  ('C0185', 0.9974522198413518)],
 'C0006': [('C0064', 0.8991275884317672),
  ('C0168', 0.9530950775029938),
  ('C0117', 0.9959643619634818)],
 'C0007': [('C0005', 0.9866014226650415),
  ('C0050', 0.9866303905885182),
  ('C0120', 0.9946163588989675)],
 'C0008': [('C0113', 0.8660678981592065),
  ('C0065', 0.8723086704562149),
  ('C0124', 0.9219221140098036)],
 'C0009': [('C0058', 0.9724329814787537),
  ('C0197', 0.9862633862100093),
  ('C0083', 0.9963661624596838)],
 'C0010': [('C0176', 0

In [14]:
# Convert the lookalike_dict to a DataFrame
lookalike_data = []
for customer_id, similar_customers in lookalike_dict.items():
    for similar_customer_id, score in similar_customers:
        lookalike_data.append([customer_id, similar_customer_id, score])

# Create DataFrame and save to CSV
lookalike_df = pd.DataFrame(lookalike_data, columns=['CustomerID', 'LookalikeCustomerID', 'SimilarityScore'])
lookalike_df.to_csv('Lookalike.csv', index=False)

# Check the saved lookalike data
lookalike_df.head()

Unnamed: 0,CustomerID,LookalikeCustomerID,SimilarityScore
0,C0001,C0048,0.964772
1,C0001,C0055,0.966848
2,C0001,C0072,0.969526
3,C0002,C0077,0.95795
4,C0002,C0010,0.973633
