In [4]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Mount Google Drive
def mount_drive():
    """Mount Google Drive at ``/content/gdrive`` via the Colab drive helper."""
    # Imported inside the function so the module is only required when
    # a mount is actually requested.
    from google.colab import drive as colab_drive

    colab_drive.mount('/content/gdrive')

# Load the three source tables from Drive (mounted first so the files exist).
# NOTE(review): the CSV paths are relative while the mount point is absolute,
# so this relies on the working directory being /content -- confirm.
mount_drive()
customers = pd.read_csv('gdrive/My Drive/Zeotap/Customers.csv')
products = pd.read_csv('gdrive/My Drive/Zeotap/Products.csv')
transactions = pd.read_csv('gdrive/My Drive/Zeotap/Transactions.csv')

# Parse the transaction timestamps into datetime values.
transactions['TransactionDate'] = pd.to_datetime(
    transactions['TransactionDate']
)

# Join transactions with customer and product attributes (default inner
# joins, so only rows with a match in every table are kept).
merged_data = (
    transactions
    .merge(customers, on='CustomerID')
    .merge(products, on='ProductID')
)

# Per-customer behavioural features: output column -> (source column, aggregation).
feature_spec = {
    'total_spend': ('TotalValue', 'sum'),
    'total_transactions': ('TransactionID', 'count'),
    'avg_transaction_value': ('TotalValue', 'mean'),
    'unique_products': ('ProductID', 'nunique'),
}
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(**feature_spec)
    .reset_index()
)

# Standardise every column after CustomerID, then assign each customer to
# one of five k-means clusters.
scaler = StandardScaler()
feature_frame = customer_features.iloc[:, 1:]
X = scaler.fit_transform(feature_frame)
kmeans = KMeans(n_clusters=5, random_state=42)
customer_features['cluster'] = kmeans.fit_predict(X)

# Lookalike model
def get_lookalikes(customer_id, data, top_n=3):
    """Return the ``top_n`` customers most similar to ``customer_id``.

    Similarity is the cosine similarity between rows of ``data``, computed
    over the columns from ``total_spend`` through ``unique_products``
    (label-based slice, both ends inclusive).

    Args:
        customer_id: Value looked up in ``data['CustomerID']``.
        data: DataFrame holding a ``CustomerID`` column and the feature
            columns ``total_spend`` .. ``unique_products``. It is left
            unmodified.
        top_n: Maximum number of lookalikes to return.

    Returns:
        DataFrame with columns ``CustomerID`` and ``similarity``, ordered by
        descending similarity and excluding ``customer_id`` itself. The frame
        is empty when ``customer_id`` has no row in ``data``.
    """
    features = data.loc[:, 'total_spend':'unique_products']
    is_target = data['CustomerID'] == customer_id

    # A customer with no row in `data` has no feature vector; passing the
    # resulting empty array to cosine_similarity would raise, so report
    # "no lookalikes" instead.
    if not is_target.any():
        return pd.DataFrame(columns=['CustomerID', 'similarity'])

    # NOTE(review): similarities are computed on the raw (unscaled) feature
    # values, so large-magnitude columns dominate -- confirm this is intended.
    similarities = cosine_similarity(
        features.loc[is_target].values, features.values
    ).flatten()

    # Score a fresh frame instead of writing a 'similarity' column into the
    # caller's DataFrame on every call.
    scored = data[['CustomerID']].assign(similarity=similarities)
    return scored[~is_target].nlargest(top_n, 'similarity')

# Top-3 lookalikes for the first 20 customers, keyed by customer id; each
# value is a list of [CustomerID, similarity] pairs.
lookalikes = {
    cid: get_lookalikes(cid, customer_features).values.tolist()
    for cid in customers['CustomerID'][:20]
}

# Flatten every customer's pairs into one row:
# [cust_id, id_1, score_1, id_2, score_2, ...]
lookalike_data = [
    [cid] + [value for pair in matches for value in pair]
    for cid, matches in lookalikes.items()
]

# Size the header to the widest row. Every row is the customer id followed
# by (id, score) pairs, so the width minus one is twice the number of ranks.
max_cols = max(map(len, lookalike_data))
columns = ['cust_id']
for rank in range(1, (max_cols - 1) // 2 + 1):
    columns.append(f'similar_customer_{rank}')
    columns.append(f'score_{rank}')

# Write the lookalike table back to the Drive folder.
lookalike_df = pd.DataFrame(lookalike_data, columns=columns)
lookalike_df.to_csv('gdrive/My Drive/Zeotap/Lookalike.csv', index=False)

# Report completion
print("Lookalike model completed. Results saved to 'Lookalike.csv'.")


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Lookalike model completed. Results saved to 'Lookalike.csv'.
