In [13]:
# !pip3 install scikit-learn
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Load the three input tables. The paths are relative, so the CSV files must
# sit in the notebook's current working directory.
customers, products, transactions = map(
    pd.read_csv,
    ('Customers.csv', 'Products.csv', 'Transactions.csv'),
)

In [15]:
# Convert the date columns to pandas datetimes. SignupDate is later subtracted
# from the latest signup date to derive a day-count feature, which requires a
# datetime dtype.
customers = customers.assign(
    SignupDate=lambda frame: pd.to_datetime(frame['SignupDate'])
)
transactions = transactions.assign(
    TransactionDate=lambda frame: pd.to_datetime(frame['TransactionDate'])
)

In [17]:
# Enrich every transaction row with its customer attributes and its product
# attributes. Left joins keep all transactions even when a lookup key is
# missing on the right-hand side.
# Column names shared by both sides of a merge (other than the key) receive
# pandas' default '_x' / '_y' suffixes; the aggregation below reads 'Price_x',
# so presumably 'Price' exists in both Transactions and Products -- verify.
merged_data = (
    transactions
    .merge(customers, how='left', on='CustomerID')
    .merge(products, how='left', on='ProductID')
)

In [18]:
# Collapse the transaction-level table to one row per customer.
# Only customers that appear in `transactions` end up here, because the merged
# table is built from transaction rows.
customer_features = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalValue=('TotalValue', 'sum'),     # total purchase value
        Quantity=('Quantity', 'sum'),         # total quantity purchased
        Price_x=('Price_x', 'mean'),          # mean price over the customer's transaction rows
        ProductID=('ProductID', 'nunique'),   # number of distinct products purchased
        Region=('Region', 'first'),           # region (categorical)
        SignupDate=('SignupDate', 'first'),   # signup date
    )
    .reset_index()
)


In [19]:
# One-hot encode Region into Region_<value> indicator columns; drop_first=True
# omits the indicator for the first category level.
# NOTE(review): these Region_* columns are not listed in `numeric_cols`, so
# they are neither scaled nor passed to cosine_similarity later in this
# notebook -- confirm whether Region was meant to influence the lookalikes.
customer_features = pd.get_dummies(
    data=customer_features,
    columns=['Region'],
    drop_first=True,
)

In [20]:
# Replace the raw signup timestamp with a numeric feature: whole days between
# each customer's signup and the most recent signup among these customers.
customer_features = (
    customer_features
    .assign(
        SignupDays=lambda frame: (
            frame['SignupDate'].max() - frame['SignupDate']
        ).dt.days
    )
    .drop(columns=['SignupDate'])
)

In [21]:
# Columns that get standardised and later fed to the similarity computation.
numeric_cols = [
    'TotalValue',
    'Quantity',
    'Price_x',
    'ProductID',
    'SignupDays',
]

# Standardise each listed column so that no single feature dominates the
# cosine similarity purely because of its scale.
scaler = StandardScaler()
customer_features[numeric_cols] = scaler.fit_transform(
    customer_features[numeric_cols]
)

In [22]:
# Pairwise cosine similarity between customers, computed on the standardised
# numeric features only.
# NOTE(review): the Region_* dummy columns are not in `numeric_cols`, so region
# does not affect these scores -- confirm that this is intended.
similarity_matrix = cosine_similarity(customer_features[numeric_cols])

# Label both axes with CustomerID so that similarity_df[a][b] is the score
# between customers a and b.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_features['CustomerID'],
    columns=customer_features['CustomerID'],
)


In [23]:
# For every customer, collect the three most similar *other* customers as a
# list of (customer_id, similarity_score) tuples, best match first.
#
# The customer's own entry is removed by label before ranking. Skipping the
# first sorted row instead would assume the customer always ranks first
# against itself, which breaks when another customer ties at the top, when
# floating-point round-off nudges a neighbour above the self-score, or when a
# customer's feature vector is all zeros (self-similarity is then 0). In those
# cases the customer could be reported as its own lookalike and a genuine
# neighbour would be lost.
lookalike_map = {}
for cust_id in customer_features['CustomerID']:
    similar_customers = (
        similarity_df[cust_id]
        .drop(labels=cust_id)          # never recommend a customer to itself
        .sort_values(ascending=False)
        .head(3)                       # top 3 lookalikes
    )
    lookalike_map[cust_id] = list(zip(similar_customers.index, similar_customers.values))

In [24]:
# Restrict the results to the first 20 customers in feature-table order.
# presumably these are the 20 smallest CustomerIDs, since groupby sorts its
# keys by default -- verify that these are the IDs the deliverable expects.
first_20_customers = customer_features['CustomerID'].iloc[:20]
lookalike_results = {
    customer_id: lookalike_map[customer_id]
    for customer_id in first_20_customers
}

In [26]:
# Flatten {customer: [(lookalike, score), ...]} into one record per
# (customer, lookalike) pair and write the result to CSV.
output_path = 'Anuneet_Rastogi_Lookalike.csv'

lookalike_list = [
    {'cust_id': cust_id, 'similar_cust_id': similar_cust_id, 'score': score}
    for cust_id, lookalikes in lookalike_results.items()
    for similar_cust_id, score in lookalikes
]

# Pin the column order explicitly so that an empty result still produces a
# CSV with the expected header row.
lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=['cust_id', 'similar_cust_id', 'score'],
)
lookalike_df.to_csv(output_path, index=False)

# Summary
# Report the path that was actually written; the message previously named
# 'Lookalike.csv', which is not the file this cell creates.
print(f"Lookalike model completed. Results saved to '{output_path}'.")

Lookalike model completed. Results saved to 'Lookalike.csv'.
