# Task 2: Lookalike Model

In [None]:
!pip install pandas scikit-learn



In [None]:
import pandas as pd

# Paths of the uploaded CSV files
customers_path = '/content/Customers.csv'
products_path = '/content/Products.csv'
transactions_path = '/content/Transactions.csv'

# Read each file into its own DataFrame, in the same order as the paths above
customers_df, products_df, transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (customers_path, products_path, transactions_path)
)

# Preview the first rows of every dataset; the cell output is a tuple of the three previews
tuple(frame.head() for frame in (customers_df, products_df, transactions_df))


(  CustomerID        CustomerName         Region  SignupDate
 0      C0001    Lawrence Carroll  South America  2022-07-10
 1      C0002      Elizabeth Lutz           Asia  2022-02-13
 2      C0003      Michael Rivera  South America  2024-03-07
 3      C0004  Kathleen Rodriguez  South America  2022-10-09
 4      C0005         Laura Weber           Asia  2022-08-15,
   ProductID              ProductName     Category   Price
 0      P001     ActiveWear Biography        Books  169.30
 1      P002    ActiveWear Smartwatch  Electronics  346.30
 2      P003  ComfortLiving Biography        Books   44.12
 3      P004            BookWorld Rug   Home Decor   95.69
 4      P005          TechPro T-Shirt     Clothing  429.31,
   TransactionID CustomerID ProductID      TransactionDate  Quantity  \
 0        T00001      C0199      P067  2024-08-25 12:38:23         1   
 1        T00112      C0146      P067  2024-05-27 22:23:54         1   
 2        T00166      C0127      P067  2024-04-25 07:38:55    

In [None]:
# Step 1: Merge datasets to create customer profiles
# Attach product details to every transaction. Many transactions may reference the
# same product, but each ProductID must appear only once in products_df; the
# many-to-one validation raises a MergeError instead of silently duplicating
# transactions (which would inflate any per-customer totals built from this frame).
# Note: both inputs carry a "Price" column, so pandas suffixes them as
# Price_x (from transactions_df) and Price_y (from products_df).
transactions_products = transactions_df.merge(
    products_df, on="ProductID", validate="many_to_one"
)

# Attach customer attributes to every transaction, with the same uniqueness
# guard on CustomerID in customers_df.
customer_profiles = transactions_products.merge(
    customers_df, on="CustomerID", validate="many_to_one"
)

# Display the structure of the merged dataset
customer_profiles.head()


Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,ProductName,Category,Price_y,CustomerName,Region,SignupDate
0,T00001,C0199,P067,2024-08-25 12:38:23,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Andrea Jenkins,Europe,2022-12-03
1,T00112,C0146,P067,2024-05-27 22:23:54,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Brittany Harvey,Asia,2024-09-04
2,T00166,C0127,P067,2024-04-25 07:38:55,1,300.68,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Kathryn Stevens,Europe,2024-04-04
3,T00272,C0087,P067,2024-03-26 22:55:37,2,601.36,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Travis Campbell,South America,2024-04-11
4,T00363,C0070,P067,2024-03-21 15:10:10,3,902.04,300.68,ComfortLiving Bluetooth Speaker,Electronics,300.68,Timothy Perez,Europe,2022-03-15


In [None]:
# Step 2: Build one summary row per customer
# What is kept for every CustomerID:
#   Category   -> dict mapping each product category to its purchase count
#   TotalValue -> sum over all of the customer's transactions
#   Region     -> first value encountered (assumed constant for a given customer)
profile_aggregations = {
    "Category": lambda categories: categories.value_counts().to_dict(),
    "TotalValue": "sum",
    "Region": "first",
}

# Friendlier column names for the aggregated frame
readable_names = {
    "Category": "CategoryPreferences",
    "TotalValue": "TotalSpending",
    "Region": "Region",
}

customer_profiles_agg = (
    customer_profiles
    .groupby("CustomerID")
    .agg(profile_aggregations)
    .reset_index()
    .rename(columns=readable_names)
)

# Show the first aggregated profiles
customer_profiles_agg.head()


Unnamed: 0,CustomerID,CategoryPreferences,TotalSpending,Region
0,C0001,"{'Electronics': 3, 'Books': 1, 'Home Decor': 1}",3354.52,South America
1,C0002,"{'Home Decor': 2, 'Clothing': 2}",1862.74,Asia
2,C0003,"{'Home Decor': 2, 'Clothing': 1, 'Electronics'...",2725.38,South America
3,C0004,"{'Books': 3, 'Home Decor': 3, 'Electronics': 2}",5354.88,South America
4,C0005,"{'Electronics': 2, 'Home Decor': 1}",2034.24,Asia


In [None]:
import os

import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
# Step 3: Convert customer profiles into feature vectors
def process_preferences(preferences):
    """
    Convert a category-preference dictionary into a token string for vectorization.

    Each category is repeated once per purchase so that a count-based vectorizer
    recovers the purchase counts. Whitespace inside a category name is replaced
    with an underscore so that a multi-word category such as 'Home Decor' stays
    one token ('Home_Decor') instead of being split into two separate features.

    Args:
        preferences: Mapping of category name -> purchase count.

    Returns:
        The tokens joined by single spaces, with no leading or trailing
        whitespace; an empty string when there is nothing to emit.

    Example: {'Electronics': 3, 'Books': 1} -> 'Electronics Electronics Electronics Books'
    """
    tokens = []
    for category, count in preferences.items():
        # Collapse any internal whitespace so the category survives tokenization intact
        token = "_".join(str(category).split())
        if token:
            tokens.extend([token] * int(count))
    return " ".join(tokens)

# How many leading customers to score, and how many lookalikes to keep for each
N_TARGET_CUSTOMERS = 20
N_LOOKALIKES = 3

# Apply the function to prepare text-based preferences for vectorization
customer_profiles_agg["PreferencesString"] = customer_profiles_agg["CategoryPreferences"].apply(process_preferences)

# Use CountVectorizer to transform preferences into numerical vectors
vectorizer = CountVectorizer()
preference_vectors = vectorizer.fit_transform(customer_profiles_agg["PreferencesString"])

# Include TotalSpending as a numerical feature
spending_vector = customer_profiles_agg["TotalSpending"].values.reshape(-1, 1)

# Combine vectors for similarity calculation (preferences + spending)
combined_vectors = hstack([preference_vectors, spending_vector])

# Standardize every feature column before comparing customers. TotalSpending is in
# the thousands while the category counts are single digits, so on the raw values
# the cosine similarity is decided almost entirely by spending and every pair of
# customers scores ~1.0. Scaling puts all features on a comparable footing.
scaled_vectors = StandardScaler().fit_transform(combined_vectors.toarray())

# Step 4: Compute pairwise cosine similarity
similarity_matrix = cosine_similarity(scaled_vectors)

# Extract the first customers and compute their most similar customers
top_20_customers = customer_profiles_agg["CustomerID"][:N_TARGET_CUSTOMERS]
lookalike_map = {}

for idx, customer_id in enumerate(top_20_customers):
    # Get similarity scores for the current customer
    customer_similarities = similarity_matrix[idx]

    # Rank all customers by descending similarity. The stable sort makes ties
    # deterministic (lower row index first). The customer's own row is removed
    # explicitly rather than assumed to be the top-ranked entry, which is not
    # guaranteed when another customer ties with it or rounding nudges a score.
    ranked_indices = (-customer_similarities).argsort(kind="stable")
    top_indices = [i for i in ranked_indices if i != idx][:N_LOOKALIKES]

    # Cast scores to plain floats so their string form does not depend on the
    # installed NumPy version's scalar repr
    similar_customers = [
        (customer_profiles_agg["CustomerID"].iloc[i], float(customer_similarities[i]))
        for i in top_indices
    ]

    # Store the results in the map
    lookalike_map[customer_id] = similar_customers

# Convert the lookalike map to a DataFrame for saving as CSV
lookalike_df = pd.DataFrame({
    "CustomerID": list(lookalike_map.keys()),
    "Lookalikes": [str(v) for v in lookalike_map.values()]
})

# Save the results to a CSV file
lookalike_df.to_csv('lookalike.csv', index=False)


In [None]:
# Reload the saved file to check what was written. The path is the same relative
# 'lookalike.csv' used when saving, so the check reads back exactly that file
# instead of requiring the working directory to be /content.
df2 = pd.read_csv('lookalike.csv')
df2


Unnamed: 0,CustomerID,Lookalikes
0,C0001,"[('C0035', 0.9999999809217808), ('C0146', 0.99..."
1,C0002,"[('C0134', 0.9999998918701117), ('C0144', 0.99..."
2,C0003,"[('C0166', 0.9999999880837566), ('C0195', 0.99..."
3,C0004,"[('C0113', 0.9999999630699336), ('C0081', 0.99..."
4,C0005,"[('C0197', 0.9999999978270273), ('C0007', 0.99..."
5,C0006,"[('C0185', 0.9999999789912387), ('C0082', 0.99..."
6,C0007,"[('C0005', 0.999999967576945), ('C0140', 0.999..."
7,C0008,"[('C0031', 0.9999999476879287), ('C0166', 0.99..."
8,C0009,"[('C0049', 0.9999997638080255), ('C0058', 0.99..."
9,C0010,"[('C0150', 0.9999998302305604), ('C0034', 0.99..."
