In [7]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [8]:
# Mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
customers = pd.read_csv('/content/drive/MyDrive/Customers.csv')
products  = pd.read_csv('/content/drive/MyDrive/Products.csv')
transactions = pd.read_csv('/content/drive/MyDrive/Transactions.csv')

In [10]:
# Merge data for EDA
merged_data = pd.merge(transactions, customers, on="CustomerID")
merged_data = pd.merge(merged_data, products, on="ProductID")

In [11]:
merged_data.drop(columns = 'Price_y',inplace = True)

In [12]:
merged_data.rename(columns = {'Price_x':'Price_of_1'}, inplace = True)

In [13]:
merged_data['TransactionDate'] = pd.to_datetime(merged_data['TransactionDate'])
merged_data['SignupDate'] = pd.to_datetime(merged_data['SignupDate'])

## Feature Engineering of Lookalike Model

# Combine features like region, product category, total transaction value, and quantity for each customer.

In [14]:
customer_features = merged_data.groupby("CustomerID").agg({
    "Region": "first",  # Region where the customer resides
    "Category": lambda x: x.mode()[0],  # Most frequently purchased product category
    "TotalValue": "sum",  # Total value of transactions by the customer
    "Quantity": "sum"  # Total quantity purchased by the customer
}).reset_index()

In [15]:
customer_features

Unnamed: 0,CustomerID,Region,Category,TotalValue,Quantity
0,C0001,South America,Electronics,3354.52,12
1,C0002,Asia,Clothing,1862.74,10
2,C0003,South America,Home Decor,2725.38,14
3,C0004,South America,Books,5354.88,23
4,C0005,Asia,Electronics,2034.24,7
...,...,...,...,...,...
194,C0196,Europe,Home Decor,4982.88,12
195,C0197,Europe,Electronics,1928.65,9
196,C0198,Europe,Clothing,931.83,3
197,C0199,Europe,Electronics,1979.28,9


## Encoding categorical data
# Convert Region and Category into numerical values for similarity calculations.

In [16]:
encoder = LabelEncoder()
customer_features["Region"] = encoder.fit_transform(customer_features["Region"])
customer_features["Category"] = encoder.fit_transform(customer_features["Category"])

In [17]:
customer_features

Unnamed: 0,CustomerID,Region,Category,TotalValue,Quantity
0,C0001,3,2,3354.52,12
1,C0002,0,1,1862.74,10
2,C0003,3,3,2725.38,14
3,C0004,3,0,5354.88,23
4,C0005,0,2,2034.24,7
...,...,...,...,...,...
194,C0196,1,3,4982.88,12
195,C0197,1,2,1928.65,9
196,C0198,1,1,931.83,3
197,C0199,1,2,1979.28,9


## # Compute Similarities
# Create a matrix of customer features for similarity calculation.

In [18]:
features = customer_features.drop("CustomerID", axis=1)
similarity_matrix = cosine_similarity(features)

# Recommend Top 3 Similar Customers
# For the first 20 customers, find the 3 most similar customers based on the similarity matrix.

In [19]:
lookalike_results = {}
for i, customer_id in enumerate(customer_features["CustomerID"][:20]):
    # Sort similarity scores in descending order and select the top 3 similar customers.
    similar_indices = np.argsort(similarity_matrix[i])[-4:-1][::-1]
    similar_customers = customer_features.iloc[similar_indices]
    scores = similarity_matrix[i][similar_indices]
    lookalike_results[customer_id] = list(zip(similar_customers["CustomerID"], scores))

In [20]:
lookalike_results

{'C0001': [('C0012', 0.9999999467743711),
  ('C0184', 0.9999999436576429),
  ('C0035', 0.9999999357476206)],
 'C0002': [('C0136', 0.9999998701279131),
  ('C0043', 0.9999998537980136),
  ('C0134', 0.9999998046317713)],
 'C0003': [('C0157', 0.9999999801113866),
  ('C0038', 0.9999998853664823),
  ('C0076', 0.9999998766748611)],
 'C0004': [('C0132', 0.9999999994546347),
  ('C0169', 0.9999999880449344),
  ('C0165', 0.9999999779396626)],
 'C0005': [('C0178', 0.9999999852368974),
  ('C0164', 0.9999999270021319),
  ('C0146', 0.9999999247562563)],
 'C0006': [('C0126', 0.9999999985805198),
  ('C0171', 0.9999999890444783),
  ('C0118', 0.9999999839293631)],
 'C0007': [('C0146', 0.9999999999371122),
  ('C0138', 0.9999999685692955),
  ('C0170', 0.9999999637396333)],
 'C0008': [('C0113', 0.9999999787631823),
  ('C0062', 0.999999966219679),
  ('C0124', 0.9999999470387895)],
 'C0009': [('C0198', 0.9999999901629574),
  ('C0120', 0.999999904143617),
  ('C0152', 0.9999998701932185)],
 'C0010': [('C0049', 

In [21]:
# Create a DataFrame to store the lookalike results and save it to a CSV file.
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient="index", columns=["Similar_Customer_1", "Similar_Customer_2", "Similar_Customer_3"])
lookalike_df.to_csv("FirstName_LastName_Lookalike.csv")

In [22]:
lookalike_df

Unnamed: 0,Similar_Customer_1,Similar_Customer_2,Similar_Customer_3
C0001,"(C0012, 0.9999999467743711)","(C0184, 0.9999999436576429)","(C0035, 0.9999999357476206)"
C0002,"(C0136, 0.9999998701279131)","(C0043, 0.9999998537980136)","(C0134, 0.9999998046317713)"
C0003,"(C0157, 0.9999999801113866)","(C0038, 0.9999998853664823)","(C0076, 0.9999998766748611)"
C0004,"(C0132, 0.9999999994546347)","(C0169, 0.9999999880449344)","(C0165, 0.9999999779396626)"
C0005,"(C0178, 0.9999999852368974)","(C0164, 0.9999999270021319)","(C0146, 0.9999999247562563)"
C0006,"(C0126, 0.9999999985805198)","(C0171, 0.9999999890444783)","(C0118, 0.9999999839293631)"
C0007,"(C0146, 0.9999999999371122)","(C0138, 0.9999999685692955)","(C0170, 0.9999999637396333)"
C0008,"(C0113, 0.9999999787631823)","(C0062, 0.999999966219679)","(C0124, 0.9999999470387895)"
C0009,"(C0198, 0.9999999901629574)","(C0120, 0.999999904143617)","(C0152, 0.9999998701932185)"
C0010,"(C0049, 0.9999998859969678)","(C0091, 0.999999741004611)","(C0111, 0.999999364161573)"


# Explanation:
# I aggregated transaction and customer data to extract meaningful features for each customer.
# The cosine similarity was used to measure how close two customers are in terms of their purchase behavior and profile.
# Top 3 similar customers were identified for the first 20 customers, and results were saved in a structured format.
