<a href="https://colab.research.google.com/github/SOBIKA-G/Data-science-assignment/blob/main/Sobika_G_Lookalike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import required libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
# NOTE(review): the paths look like a Google Drive mount inside Colab; the
# drive presumably has to be mounted before this cell runs -- confirm.
customers = pd.read_csv('/content/drive/MyDrive/Customers.csv')
products = pd.read_csv('/content/drive/MyDrive/Products (1).csv')
transactions = pd.read_csv('/content/drive/MyDrive/Transactions (1).csv')

# Clean the column names in transactions dataset
transactions.columns = transactions.columns.str.strip()  # Strip extra spaces
# Normalise leftover merge-suffixed columns, if the file contains any.
transactions = transactions.rename(
    columns={'Category_x': 'Category', 'Category_y': 'DuplicateCategory'}
)

# If transactions already carries a 'Category' column, the merge below would
# emit 'Category_x'/'Category_y' instead of a single 'Category', and any later
# lookup of 'Category' would fail. Drop it so that the products table is the
# only source of the category. This is a no-op when the column is absent.
transactions = transactions.drop(columns=['Category'], errors='ignore')

# Merge products with transactions to get product category in the transaction data.
# A left join keeps every transaction, even when its ProductID is not in products.
transactions = pd.merge(
    transactions,
    products[['ProductID', 'Category']],
    on='ProductID',
    how='left',
)

# Build a CustomerID x Category table of spend: sum TotalValue per
# (customer, category) pair, then pivot categories into columns, using 0 for
# combinations that never occur.
spend_per_customer_category = transactions.groupby(['CustomerID', 'Category'])['TotalValue'].sum()
customer_transaction = spend_per_customer_category.unstack(fill_value=0)

# Demographic frame: CustomerID and Region, taken as an independent copy so
# that adding a column does not touch the customers frame.
customer_demo = customers[['CustomerID', 'Region']].copy()

# Account age: whole days elapsed between SignupDate and the moment this runs.
signup_dates = pd.to_datetime(customers['SignupDate'])
customer_demo['SignupDays'] = (pd.to_datetime('today') - signup_dates).dt.days

# Left-join the spend table onto the demographics so every customer is kept;
# the join leaves NaN for customers absent from the spend table, and those
# (like any other missing value in the frame) are replaced with 0.
customer_data = customer_demo.merge(customer_transaction, on='CustomerID', how='left').fillna(0)

# Standardize the per-category spend columns only; CustomerID, Region and
# SignupDays are left as they are.
scaler = StandardScaler()
transaction_columns = customer_transaction.columns
scaled_spend = scaler.fit_transform(customer_data[transaction_columns])
customer_data[transaction_columns] = scaled_spend

# customer_data now holds the demographic columns next to the scaled
# transaction-history columns.

# Similarity Calculation
# CustomerID, Region and SignupDays are dropped, so the cosine similarity is
# computed from the remaining (scaled per-category spend) columns only.
customer_feature_matrix = customer_data.drop(columns=['CustomerID', 'Region', 'SignupDays']).values
similarity_matrix = cosine_similarity(customer_feature_matrix)

# Get top 3 most similar customers for the first 20 customers.
# NOTE(review): "first 20" is positional (row order of customer_data); this
# presumably corresponds to C0001 - C0020 -- confirm the row order of the
# customers file.
NUM_TARGET_CUSTOMERS = 20
TOP_N = 3

# Pull the IDs out once instead of doing a pandas positional lookup per pair.
customer_ids = customer_data['CustomerID'].tolist()

lookalike_list = []
# Slicing (rather than range(20)) keeps this safe when there are fewer than
# NUM_TARGET_CUSTOMERS customers.
for i, customer_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    similarities = similarity_matrix[i]

    # Exclude the current customer (to avoid self-matching)
    similar_customers = [
        (other_id, score)
        for j, (other_id, score) in enumerate(zip(customer_ids, similarities))
        if j != i
    ]

    # Sort by similarity score (highest first) and keep the top matches.
    # sorted() is stable, so tied scores keep their original row order.
    similar_customers_sorted = sorted(similar_customers, key=lambda x: x[1], reverse=True)[:TOP_N]

    # Store the results, one row per (customer, lookalike) pair
    for lookalike_id, score in similar_customers_sorted:
        lookalike_list.append({
            "CustomerID": customer_id,
            "Lookalike_CustomerID": lookalike_id,
            "Similarity_Score": score,
        })

# Convert the results to a DataFrame.
# The columns are named explicitly so the frame (and the CSV header) has the
# expected shape even if lookalike_list is empty.
lookalike_df = pd.DataFrame(
    lookalike_list,
    columns=["CustomerID", "Lookalike_CustomerID", "Similarity_Score"],
)

# Save the output to a CSV file (without the positional index column)
lookalike_df.to_csv("/content/Lookalike.csv", index=False)

# Show every result row. The frame holds several rows per customer, so a
# fixed head(20) would cut the listing off partway through the customers.
lookalike_df


Unnamed: 0,CustomerID,Lookalike_CustomerID,Similarity_Score
0,C0001,C0091,0.988881
1,C0001,C0069,0.984308
2,C0001,C0184,0.978609
3,C0002,C0159,0.979384
4,C0002,C0036,0.956507
5,C0002,C0134,0.907855
6,C0003,C0007,0.99686
7,C0003,C0085,0.964024
8,C0003,C0166,0.960495
9,C0004,C0075,0.983289
