In [1]:
# Colab-only helper: this import is only available inside Google Colab.
from google.colab import files
# Opens an interactive upload dialog in the browser; the recorded output of this
# cell shows Customers.csv, Products.csv and Transactions.csv being saved.
# NOTE(review): `uploaded` is not referenced again in the visible cells — the
# CSVs are later read by file name.
uploaded = files.upload()  # This will prompt a file upload dialog


Saving Customers.csv to Customers.csv
Saving Products.csv to Products.csv
Saving Transactions.csv to Transactions.csv


In [3]:
import pandas as pd

In [6]:
# Load the three raw tables from the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach customer and product attributes to every transaction row.
#
# validate='many_to_one' makes pandas raise MergeError when CustomerID (or
# ProductID) is duplicated in the lookup table. Without the check, a duplicated
# key would silently multiply transaction rows and inflate every per-customer
# sum/count computed from `merged_data`.
#
# how='inner' is the pandas default, spelled out so it is clear that
# transactions without a matching customer/product are dropped.
merged_data = (
    transactions
    .merge(customers, on='CustomerID', how='inner', validate='many_to_one')
    .merge(products, on='ProductID', how='inner', validate='many_to_one')
)


In [7]:
# Sanity-check the merged schema.
# The recorded output lists both 'Price_x' and 'Price_y'; '_x'/'_y' are the
# default suffixes pandas adds when a non-key column name exists on both sides
# of a merge — presumably both Transactions.csv and Products.csv carry a
# 'Price' column (TODO confirm against the raw files).
print(merged_data.columns)


Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [8]:
# Collapse the merged transaction table to one feature row per customer.
# Named aggregation gives each output column its final name directly, so no
# separate rename step is needed.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        # Sum of TotalValue over the customer's transactions.
        TotalValue=('TotalValue', 'sum'),
        # Number of transactions recorded for the customer.
        TransactionCount=('TransactionID', 'count'),
        # Most frequent product category; on ties the first value returned
        # by Series.mode() is kept.
        Category=('Category', lambda x: x.mode()[0]),
        # Mean of Price_y (the price column with the '_y' merge suffix).
        AvgPrice=('Price_y', 'mean'),
    )
    .reset_index()
)


In [9]:
# Preview the first rows of the per-customer feature table
# (CustomerID, TotalValue, TransactionCount, Category, AvgPrice).
print(customer_features.head())


  CustomerID  TotalValue  TransactionCount     Category    AvgPrice
0      C0001     3354.52                 5  Electronics  278.334000
1      C0002     1862.74                 4     Clothing  208.920000
2      C0003     2725.38                 4   Home Decor  195.707500
3      C0004     5354.88                 8        Books  240.636250
4      C0005     2034.24                 3  Electronics  291.603333


In [10]:
# Remove the redundant 'Price_x' column left behind by the merge.
# errors='ignore' keeps this cell safe to re-run: `merged_data` is rebound to
# the result, so a second execution would otherwise raise KeyError because the
# column is already gone.
merged_data = merged_data.drop(columns=['Price_x'], errors='ignore')


In [12]:
# Keep only the numeric columns of the feature table (select_dtypes accepts a
# single dtype selector as well as a list).
numeric_features = customer_features.select_dtypes(include='number')


In [13]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale each numeric feature column independently.
# Fitting and transforming are done as two explicit steps on the same data,
# which is what fit_transform does in one call.
scaler = MinMaxScaler()
scaler.fit(numeric_features)
scaled_features = scaler.transform(numeric_features)


In [14]:
# Wrap the scaled array back into a DataFrame that carries the original
# numeric column names.
scaled_features_df = pd.DataFrame(
    data=scaled_features,
    columns=numeric_features.columns,
)

# Place the identifier and category columns next to the scaled numeric columns.
# The two frames are aligned on their row index.
# NOTE(review): `final_features` is not referenced again in the visible cells.
final_features = pd.concat(
    objs=[customer_features[['CustomerID', 'Category']], scaled_features_df],
    axis='columns',
)


In [15]:
from sklearn.preprocessing import LabelEncoder

# Learn the Category -> integer code mapping, then store the codes in a new
# 'Category_Encoded' column on the feature table (fit() returns the encoder).
# NOTE(review): these codes are plain integers with no meaningful order; treat
# the column as a label, not as a magnitude — confirm it is intended as a model
# feature.
encoder = LabelEncoder().fit(customer_features['Category'])
customer_features['Category_Encoded'] = encoder.transform(customer_features['Category'])


In [16]:
# One-hot encode 'Category' (the original column is replaced by one indicator
# column per category).
#
# drop_first=False keeps an indicator for every category. Dropping the first
# level is a device for avoiding collinearity in linear models; for a
# similarity computation it would leave the dropped category as an all-zero
# vector, so two customers sharing that category would get no credit for the
# match while every other shared category adds to the dot product.
customer_features = pd.get_dummies(customer_features, columns=['Category'], drop_first=False)


In [17]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Select the similarity features by name instead of by position
# (`iloc[:, 1:]`), so a change in column order cannot leak the identifier into
# the feature matrix.
# 'Category_Encoded' is excluded on purpose: it holds arbitrary integer codes
# for a nominal variable, which would make some categories look numerically
# "closer" than others. The category signal is already carried by the one-hot
# 'Category_*' columns.
feature_columns = [
    col for col in customer_features.columns
    if col not in ('CustomerID', 'Category_Encoded')
]

# Cast to float so the indicator columns (boolean in recent pandas) are scaled
# as 0.0/1.0 alongside the numeric features.
scaled_features = scaler.fit_transform(customer_features[feature_columns].astype(float))


**Compute Similarity**

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of the scaled feature matrix;
# entry [i, j] compares row i with row j.
similarity_matrix = cosine_similarity(X=scaled_features)


In [19]:
# Number of most-similar customers kept for each customer.
TOP_N = 3

# Pull the IDs out once; row position in `similarity_matrix` matches row
# position in `customer_features`. This avoids building a full row Series via
# `.iloc[...]` for every single lookup inside the loop.
customer_ids = list(customer_features['CustomerID'])

# Maps CustomerID -> list of (other CustomerID, similarity score) pairs,
# ordered from most to least similar.
lookalikes = {}
for idx, row in enumerate(similarity_matrix):
    # Rank every *other* customer by similarity (a customer is never its own
    # lookalike) and keep the best TOP_N. sorted() is stable, so ties keep
    # their original row order.
    ranked = sorted(
        ((i, score) for i, score in enumerate(row) if i != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )[:TOP_N]
    # float(...) converts the NumPy scalar to a built-in float before rounding.
    # round() on np.float64 returns np.float64, whose repr under NumPy >= 2 is
    # "np.float64(0.99)" and would leak into any text/CSV export of this dict.
    lookalikes[customer_ids[idx]] = [
        (customer_ids[i], round(float(score), 2))
        for i, score in ranked
    ]


In [21]:
import csv

# One CSV row per customer: the ID plus its list of (lookalike ID, score) pairs.
# Scores are converted to built-in floats here because the list is written via
# its text representation; a NumPy scalar would be rendered as
# "np.float64(0.99)" under NumPy >= 2 instead of "0.99".
lookalike_list = [
    {
        'CustomerID': cust_id,
        'Lookalikes': [(other_id, float(score)) for other_id, score in lookalike_data],
    }
    for cust_id, lookalike_data in lookalikes.items()
]

# newline='' is required by the csv module to avoid blank lines on some
# platforms; the encoding is pinned so the file does not depend on the locale.
with open('Sudhashini_Enugula_Lookalike.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['CustomerID', 'Lookalikes'])
    writer.writeheader()
    writer.writerows(lookalike_list)
