# Task 2: Lookalike Model

Step 1: Load and Preprocess Data

In [8]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [9]:
#Load datasets
customers = pd.read_csv('Customers.csv')
transactions = pd.read_csv('Transactions.csv')
products = pd.read_csv('Products.csv')

Step 2: Merge datasets

In [10]:
merged_data = transactions.merge(products, on='ProductID', how='left')
merged_data = merged_data.merge(customers, on='CustomerID', how='left')

Step 3: Feature Engineering

In [13]:
# Aggregate features per customer
customer_features = merged_data.groupby('CustomerID').agg({
    'TotalValue': ['sum', 'mean'],  # Total and average spend
    'Category': lambda x: x.value_counts().idxmax(),  # Most purchased category
}).reset_index()

# Flatten MultiIndex columns
customer_features.columns = ['CustomerID', 'TotalSpend', 'AvgSpend', 'TopCategory']

# Merge with customer profile
customer_features = customer_features.merge(customers, on='CustomerID', how='left')

Step 4: Calculate Similarity

In [14]:
print("Columns in customer_features:", customer_features.columns)

# Strip spaces from column names
customer_features.columns = customer_features.columns.str.strip()

# Ensure 'Region' and 'TopCategory' columns are available
if 'Region' in customer_features.columns and 'TopCategory' in customer_features.columns:
    print("'Region' and 'TopCategory' columns found.")
else:
    print("One or both of 'Region' and 'TopCategory' columns are missing. Please check the dataset.")

Columns in customer_features: Index(['CustomerID', 'TotalSpend', 'AvgSpend', 'TopCategory', 'CustomerName',
       'Region', 'SignupDate'],
      dtype='object')
'Region' and 'TopCategory' columns found.


Step 5: Generate Recommendations

In [15]:
#Encode categorical variables (e.g., Region, TopCategory)
categorical_cols = ['Region', 'TopCategory']
customer_features = pd.get_dummies(customer_features, columns=categorical_cols)

Step 6: Standardize numeric columns

In [16]:
scaler = StandardScaler()
numeric_cols = ['TotalSpend', 'AvgSpend']
customer_features[numeric_cols] = scaler.fit_transform(customer_features[numeric_cols])

Step 7: Calculate Similarity

In [18]:
# Check the data types of the columns to confirm they are numeric
print("Data types in customer_features after encoding:", customer_features.dtypes)

# Drop any non-numeric columns
customer_matrix = customer_features.select_dtypes(include=['float64', 'int64'])


Data types in customer_features after encoding: CustomerID                  object
TotalSpend                 float64
AvgSpend                   float64
CustomerName                object
SignupDate                  object
Region_Asia                  uint8
Region_Europe                uint8
Region_North America         uint8
Region_South America         uint8
TopCategory_Books            uint8
TopCategory_Clothing         uint8
TopCategory_Electronics      uint8
TopCategory_Home Decor       uint8
dtype: object


Step 8: Calculate Similarity

In [20]:
similarity_matrix = cosine_similarity(customer_matrix)

Step 9: Generate Recommendations

In [21]:

lookalike_map = {}
for idx, customer_id in enumerate(customer_features['CustomerID']):
    similarity_scores = list(enumerate(similarity_matrix[idx]))
    similar_customers = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:4]
    lookalike_map[customer_id] = [
        (customer_features['CustomerID'].iloc[i], round(score, 2)) for i, score in similar_customers
    ]

Step 10: Save Lookalike Map to CSV

In [23]:
lookalike_df = pd.DataFrame({
    'CustomerID': lookalike_map.keys(),
    'Lookalikes': [str(v) for v in lookalike_map.values()]
})
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display Lookalike Map for the first 20 customers
lookalike_map_subset = {k: v for i, (k, v) in enumerate(lookalike_map.items()) if i < 20}
print(lookalike_map_subset)

{'C0001': [('C0010', 1.0), ('C0009', 1.0), ('C0029', 1.0)], 'C0002': [('C0151', 1.0), ('C0029', 1.0), ('C0043', 1.0)], 'C0003': [('C0005', 1.0), ('C0178', 1.0), ('C0144', 1.0)], 'C0004': [('C0067', 1.0), ('C0021', 1.0), ('C0075', 1.0)], 'C0005': [('C0003', 1.0), ('C0073', 1.0), ('C0178', 1.0)], 'C0006': [('C0079', 1.0), ('C0117', 1.0), ('C0196', 1.0)], 'C0007': [('C0085', 1.0), ('C0140', 1.0), ('C0092', 0.99)], 'C0008': [('C0194', 1.0), ('C0154', 1.0), ('C0179', 1.0)], 'C0009': [('C0025', 1.0), ('C0001', 1.0), ('C0010', 1.0)], 'C0010': [('C0001', 1.0), ('C0009', 1.0), ('C0029', 1.0)], 'C0011': [('C0183', 1.0), ('C0048', 1.0), ('C0016', 1.0)], 'C0012': [('C0136', 1.0), ('C0102', 1.0), ('C0155', 1.0)], 'C0013': [('C0126', 1.0), ('C0045', 1.0), ('C0143', 1.0)], 'C0014': [('C0192', 1.0), ('C0128', 1.0), ('C0031', 1.0)], 'C0015': [('C0035', 1.0), ('C0146', 1.0), ('C0132', 1.0)], 'C0016': [('C0048', 1.0), ('C0183', 1.0), ('C0064', 1.0)], 'C0017': [('C0162', 1.0), ('C0113', 1.0), ('C0093', 0.

Evaluation Criteria:
Model Accuracy and Logic: The accuracy of the similarity calculation is based on the chosen feature set (spending behavior, product categories, etc.) and the choice of similarity measure (Cosine Similarity or Euclidean Distance).

Quality of Recommendations and Similarity Scores: You can assess the quality by checking if the similar customers have comparable purchasing behavior or demographic similarities. Fine-tune features like TotalSpend and AvgSpend to improve the results.