<a href="https://colab.research.google.com/github/Tousif-Rehman/EDA_Assignment/blob/main/Tousif_Rehman_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# Read the three raw tables from CSV files in the working directory.
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Attach customer attributes to each transaction, then product attributes.
# Both joins use pandas' default inner join, so a transaction whose
# CustomerID or ProductID has no match in the other table is dropped.
merged = transactions.merge(customers, on='CustomerID')
final_data = merged.merge(products, on='ProductID')

In [5]:
# Days between each customer's signup and their most recent transaction.
#
# `customer_features` is assembled here from `customers` and `final_data`
# so that this cell runs on a fresh kernel; previously it read a
# `customer_features` variable that is only created by a later cell, which
# raises NameError under Restart & Run All.
last_purchase = (
    final_data
    .assign(TransactionDate=pd.to_datetime(final_data['TransactionDate']))
    .groupby('CustomerID', as_index=False)['TransactionDate']
    .max()
)
customer_features = pd.merge(customers, last_purchase, on='CustomerID')
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])
customer_features['TimeSinceSignup'] = (
    customer_features['TransactionDate'] - customer_features['SignupDate']
).dt.days

In [8]:
# Days between each customer's signup and their most recent transaction,
# tolerating unparseable signup dates.
#
# The frame is rebuilt from `customers` and `final_data` so this cell does
# not depend on a `customer_features` variable created by a later cell
# (which raises NameError under Restart & Run All).
last_purchase = (
    final_data
    .assign(TransactionDate=pd.to_datetime(final_data['TransactionDate']))
    .groupby('CustomerID', as_index=False)['TransactionDate']
    .max()
)
customer_features = pd.merge(customers, last_purchase, on='CustomerID')

# Convert SignupDate to datetime. errors='coerce' turns any value that
# cannot be parsed into NaT instead of raising, so those customers end up
# with a missing (NaN) TimeSinceSignup below.
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'], errors='coerce')

# Now calculate the TimeSinceSignup
customer_features['TimeSinceSignup'] = (
    customer_features['TransactionDate'] - customer_features['SignupDate']
).dt.days


In [10]:
# Build one feature row per customer from the merged transaction table.

# Features 1-3: spend and activity aggregates, sharing a single grouping.
by_customer = final_data.groupby('CustomerID')
total_spending = by_customer['TotalValue'].sum().reset_index(name='TotalSpending')
avg_transaction = by_customer['TotalValue'].mean().reset_index(name='AvgTransactionValue')
transaction_count = by_customer['TransactionID'].count().reset_index(name='TransactionCount')

# Feature 4: share of each customer's transactions that fall in each product
# category. Counts are pivoted first, then every row is divided by its total.
category_pivot = final_data.pivot_table(
    index='CustomerID',
    columns='Category',
    values='TransactionID',
    aggfunc='count',
    fill_value=0,
)
category_percent = category_pivot.div(category_pivot.sum(axis='columns'), axis='index')

# Feature 5: one indicator column per region.
region_encoded = pd.get_dummies(customers[['CustomerID', 'Region']], columns=['Region'])

# Feature 6: days from signup to the customer's latest transaction.
for date_col in ('SignupDate', 'TransactionDate'):
    final_data[date_col] = pd.to_datetime(final_data[date_col])
last_transaction = final_data.groupby('CustomerID', as_index=False)['TransactionDate'].max()
customer_features = customers.merge(last_transaction, on='CustomerID')
# `customers` was not converted above, so parse its SignupDate here as well.
customer_features['SignupDate'] = pd.to_datetime(customer_features['SignupDate'])
signup_to_last = customer_features['TransactionDate'] - customer_features['SignupDate']
customer_features['TimeSinceSignup'] = signup_to_last.dt.days
time_since_signup = customer_features[['CustomerID', 'TimeSinceSignup']]

# Join every feature table on CustomerID (inner joins, left to right).
features = total_spending
for feature_part in (
    avg_transaction,
    transaction_count,
    region_encoded,
    category_percent.reset_index(),
    time_since_signup,
):
    features = features.merge(feature_part, on='CustomerID')

# Index by CustomerID so that only feature columns remain as data.
features = features.set_index('CustomerID')


In [11]:
# Standardise every feature column: learn the per-column statistics first,
# then apply them to the same table.
scaler = StandardScaler().fit(features)
features_scaled = scaler.transform(features)

In [12]:
# Euclidean k-NN over the scaled features. Four neighbours are requested per
# query so that three remain once the queried customer itself is removed.
model = NearestNeighbors(metric='euclidean', n_neighbors=4)
model.fit(features_scaled)

In [13]:
# Build {customer_id: [(lookalike_id, score), ...]} for the first 20
# customers (C0001 to C0020).
N_LOOKALIKES = 3  # recommendations kept per customer
target_customers = [f'C{i:04d}' for i in range(1, 21)]
lookalike_map = {}

for customer_id in target_customers:
    if customer_id not in features.index:
        continue  # Skip if customer has no feature row

    # Row position of this customer in `features` / `features_scaled`.
    customer_index = features.index.get_loc(customer_id)

    # Ask for one extra neighbour: the query row is part of the fitted data,
    # so the customer normally shows up among their own neighbours.
    distances, indices = model.kneighbors(
        [features_scaled[customer_index]],
        n_neighbors=N_LOOKALIKES + 1,
    )

    # Remove the customer by position instead of assuming they are the first
    # hit. If another customer has identical scaled features (distance 0),
    # the order of the tied results is not guaranteed, and slicing off
    # element 0 could drop the twin and keep the customer as their own
    # lookalike.
    candidates = [
        (int(idx), dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx != customer_index
    ][:N_LOOKALIKES]

    neighbor_ids = [features.index[idx] for idx, _ in candidates]
    # Inverse-distance similarity: larger means closer. The small epsilon
    # avoids division by zero; the score is not capped at 1.
    neighbor_scores = [1 / (dist + 1e-6) for _, dist in candidates]

    # Store results
    lookalike_map[customer_id] = list(zip(neighbor_ids, neighbor_scores))

In [14]:
# Write the lookalike map to Lookalike.csv: one row per target customer,
# with each (lookalike id, score) pair flattened into consecutive columns.
import csv

header = ['CustomerID', 'Lookalike1', 'Score1', 'Lookalike2', 'Score2', 'Lookalike3', 'Score3']

with open('Lookalike.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerows(
        [customer_id, *(value for neighbor in neighbors for value in neighbor)]
        for customer_id, neighbors in lookalike_map.items()
    )

In [16]:
import pandas as pd

# Print a heading, then load the CSV written earlier and render it as a table.
print("Lookalike Recommendations:")
lookalike_df = pd.read_csv('Lookalike.csv')
display(lookalike_df)

Lookalike Recommendations:


Unnamed: 0,CustomerID,Lookalike1,Score1,Lookalike2,Score2,Lookalike3,Score3
0,C0001,C0112,0.61281,C0091,0.564969,C0118,0.557274
1,C0002,C0134,1.074664,C0159,0.646691,C0106,0.63703
2,C0003,C0031,1.147614,C0129,0.92615,C0195,0.761311
3,C0004,C0113,1.053361,C0104,0.641602,C0102,0.542648
4,C0005,C0007,1.21472,C0140,0.532871,C0186,0.40563
5,C0006,C0187,0.796131,C0126,0.458498,C0169,0.454326
6,C0007,C0005,1.21472,C0140,0.638771,C0186,0.369345
7,C0008,C0098,0.580419,C0194,0.535652,C0024,0.508896
8,C0009,C0198,0.482093,C0010,0.440393,C0062,0.4251
9,C0010,C0061,0.70539,C0062,0.505166,C0009,0.440393
