In [2]:
!pip install pandas scikit-learn matplotlib





[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Step 1: Import libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Step 2: Load datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Tunable parameters (previously hard-coded inline as 3 and 20).
TOP_N_LOOKALIKES = 3      # number of lookalikes reported per customer
N_TARGET_CUSTOMERS = 20   # only the first N rows of Customers.csv are exported

# Step 3: Merge datasets for feature extraction
# Left joins keep every transaction; customer/product columns are NaN when an ID is unmatched.
transactions_customers = pd.merge(transactions, customers, on='CustomerID', how='left')
full_data = pd.merge(transactions_customers, products, on='ProductID', how='left')


# Step 4: Feature engineering
def _most_common(values):
    """Return the most frequent non-null value, or None when every value is missing.

    `Series.mode()` ignores NaN by default, so an all-NaN group yields an empty
    result; indexing it with [0] would raise. Returning None lets get_dummies
    leave all Category_* columns at zero for that customer.
    """
    modes = values.mode()
    return modes.iat[0] if not modes.empty else None


customer_features = full_data.groupby('CustomerID').agg({
    'TotalValue': 'sum',       # Total spending
    'Quantity': 'sum',         # Total quantity purchased
    'Category': _most_common,  # Most purchased category
    'Region': 'first'          # Region of the customer
}).reset_index()

# One-hot encode categorical fields.
# dtype=float keeps the feature matrix purely numeric (newer pandas defaults to bool columns).
customer_features = pd.get_dummies(customer_features, columns=['Category', 'Region'], dtype=float)

# Step 5: Normalize data
# Only the continuous columns are standardized; the one-hot columns stay as 0/1.
scaler = StandardScaler()
numeric_features = ['TotalValue', 'Quantity']
scaled_features = scaler.fit_transform(customer_features[numeric_features])
scaled_data = pd.concat([
    customer_features[['CustomerID']],
    pd.DataFrame(scaled_features, columns=numeric_features, index=customer_features.index),
    customer_features.drop(['CustomerID'] + numeric_features, axis=1)
], axis=1)

# Step 6: Compute similarity matrix
# Row/column i of the matrix corresponds to row i of customer_features.
similarity_matrix = cosine_similarity(scaled_data.drop('CustomerID', axis=1))

# Step 7: Generate lookalike recommendations
# Positional list of IDs so matrix positions map to IDs without relying on the frame's index labels.
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}
for idx, customer_id in enumerate(customer_ids):
    # Exclude the customer itself explicitly: slicing off the first sorted entry is wrong
    # when another customer ties for the top score and appears earlier in the ordering.
    candidates = [(pos, score) for pos, score in enumerate(similarity_matrix[idx]) if pos != idx]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    # float(...) converts the NumPy scalar to a built-in float so the list written to the CSV
    # contains plain numbers rather than NumPy scalar reprs.
    lookalike_results[customer_id] = [
        (customer_ids[pos], round(float(score), 3))
        for pos, score in candidates[:TOP_N_LOOKALIKES]
    ]

# Step 8: Filter for the first N_TARGET_CUSTOMERS customers
# Note: customers with no rows in Transactions.csv never enter customer_features,
# so they are absent from the result even if they are among the first N.
target_ids = set(customers['CustomerID'].head(N_TARGET_CUSTOMERS))
lookalike_filtered = {k: v for k, v in lookalike_results.items() if k in target_ids}

# Step 9: Save results
lookalike_df = pd.DataFrame({
    'CustomerID': list(lookalike_filtered.keys()),
    'Lookalikes': list(lookalike_filtered.values()),
})
lookalike_df.to_csv('Lookalike.csv', index=False)
print("Lookalike.csv has been saved.")


Lookalike.csv has been saved.
