In [38]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
merged_data = pd.read_csv('merged_data.csv')
total_spending = merged_data.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spending.columns = ['CustomerID', 'TotalSpending']

avg_transaction_value = merged_data.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_transaction_value.columns = ['CustomerID', 'AvgTransactionValue']
favorite_category = merged_data.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
favorite_category = favorite_category.loc[favorite_category.groupby('CustomerID')['Count'].idxmax()]
favorite_category = favorite_category[['CustomerID', 'Category']]

num_transactions = merged_data.groupby('CustomerID')['TransactionID'].count().reset_index()
num_transactions.columns = ['CustomerID', 'NumTransactions']

region = merged_data[['CustomerID', 'Region']].drop_duplicates()

features = pd.merge(total_spending, avg_transaction_value, on='CustomerID')
features = pd.merge(features, favorite_category, on='CustomerID')
features = pd.merge(features, num_transactions, on='CustomerID')
features = pd.merge(features, region, on='CustomerID')

features = pd.get_dummies(features, columns=['Category', 'Region'], drop_first=True)
scaler = StandardScaler()
numerical_features = ['TotalSpending', 'AvgTransactionValue', 'NumTransactions']
features[numerical_features] = scaler.fit_transform(features[numerical_features])

features.set_index('CustomerID', inplace=True)

In [40]:
similarity_matrix = cosine_similarity(features)
similarity_df = pd.DataFrame(similarity_matrix, index=features.index, columns=features.index)

In [42]:
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    similarity_scores = similarity_df.loc[customer_id]
    similar_customers = similarity_scores.sort_values(ascending=False).iloc[1:top_n+1]
    return similar_customers
lookalike_map = {}
for customer_id in features.index[:20]:
    similar_customers = get_top_similar_customers(customer_id, similarity_df)
    lookalike_map[customer_id] = list(zip(similar_customers.index, similar_customers.values))
lookalike_df = pd.DataFrame(lookalike_map.items(), columns=['CustomerID', 'SimilarCustomers'])
lookalike_df.to_csv('Sayantan_Banerjee_Lookalike.csv', index=False)

In [44]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Load the datasets
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

# Merge the datasets
merged_data = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(merged_data, products, on='ProductID')

# Display the first few rows of the merged dataset
print("Merged Dataset:")
print(merged_data.head())

Merged Dataset:
  TransactionID CustomerID ProductID      TransactionDate  Quantity  \
0        T00001      C0199      P067  2024-08-25 12:38:23         1   
1        T00112      C0146      P067  2024-05-27 22:23:54         1   
2        T00166      C0127      P067   2024-04-25 7:38:55         1   
3        T00272      C0087      P067  2024-03-26 22:55:37         2   
4        T00363      C0070      P067  2024-03-21 15:10:10         3   

   TotalValue  Price_x     CustomerName         Region  SignupDate  \
0      300.68   300.68   Andrea Jenkins         Europe  2022-12-03   
1      300.68   300.68  Brittany Harvey           Asia  2024-09-04   
2      300.68   300.68  Kathryn Stevens         Europe  2024-04-04   
3      601.36   300.68  Travis Campbell  South America  2024-04-11   
4      902.04   300.68    Timothy Perez         Europe  2022-03-15   

                       ProductName     Category  Price_y  
0  ComfortLiving Bluetooth Speaker  Electronics   300.68  
1  ComfortLiving B

In [46]:
# Feature Engineering
# 1. Total spending per customer
total_spending = merged_data.groupby('CustomerID')['TotalValue'].sum().reset_index()
total_spending.columns = ['CustomerID', 'TotalSpending']

# 2. Average transaction value per customer
avg_transaction_value = merged_data.groupby('CustomerID')['TotalValue'].mean().reset_index()
avg_transaction_value.columns = ['CustomerID', 'AvgTransactionValue']

# 3. Favorite product category per customer
favorite_category = merged_data.groupby(['CustomerID', 'Category']).size().reset_index(name='Count')
favorite_category = favorite_category.loc[favorite_category.groupby('CustomerID')['Count'].idxmax()]
favorite_category = favorite_category[['CustomerID', 'Category']]

# 4. Number of transactions per customer
num_transactions = merged_data.groupby('CustomerID')['TransactionID'].count().reset_index()
num_transactions.columns = ['CustomerID', 'NumTransactions']

# 5. Region (categorical feature)
region = merged_data[['CustomerID', 'Region']].drop_duplicates()

# Merge all features into one DataFrame
features = pd.merge(total_spending, avg_transaction_value, on='CustomerID')
features = pd.merge(features, favorite_category, on='CustomerID')
features = pd.merge(features, num_transactions, on='CustomerID')
features = pd.merge(features, region, on='CustomerID')

# One-hot encode the 'Category' and 'Region' columns
features = pd.get_dummies(features, columns=['Category', 'Region'], drop_first=True)

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['TotalSpending', 'AvgTransactionValue', 'NumTransactions']
features[numerical_features] = scaler.fit_transform(features[numerical_features])

# Set CustomerID as the index
features.set_index('CustomerID', inplace=True)

# Display the first few rows of the features DataFrame
print("\nFeatures DataFrame:")
print(features.head())


Features DataFrame:
            TotalSpending  AvgTransactionValue  NumTransactions  \
CustomerID                                                        
C0001           -0.061701            -0.070263        -0.011458   
C0002           -0.877744            -0.934933        -0.467494   
C0003           -0.405857            -0.026271        -0.467494   
C0004            1.032547            -0.076769         1.356650   
C0005           -0.783929            -0.040028        -0.923530   

            Category_Clothing  Category_Electronics  Category_Home Decor  \
CustomerID                                                                 
C0001                   False                  True                False   
C0002                    True                 False                False   
C0003                   False                 False                 True   
C0004                   False                 False                False   
C0005                   False                  True  

In [48]:
# Calculate cosine similarity between customers
similarity_matrix = cosine_similarity(features)

# Convert the similarity matrix to a DataFrame
similarity_df = pd.DataFrame(similarity_matrix, index=features.index, columns=features.index)

# Display the similarity matrix for the first 5 customers
print("\nSimilarity Matrix (First 5 Customers):")
print(similarity_df.iloc[:5, :5])


Similarity Matrix (First 5 Customers):
CustomerID     C0001     C0002     C0003     C0004     C0005
CustomerID                                                  
C0001       1.000000  0.052207  0.471690  0.330347  0.476745
C0002       0.052207  1.000000  0.229411 -0.438839  0.435259
C0003       0.471690  0.229411  1.000000 -0.016789  0.309529
C0004       0.330347 -0.438839 -0.016789  1.000000 -0.662552
C0005       0.476745  0.435259  0.309529 -0.662552  1.000000


In [50]:
# Function to get top 3 similar customers
def get_top_similar_customers(customer_id, similarity_df, top_n=3):
    # Get similarity scores for the given customer
    similarity_scores = similarity_df.loc[customer_id]
    # Sort by similarity score (descending) and exclude the customer itself
    similar_customers = similarity_scores.sort_values(ascending=False).iloc[1:top_n+1]
    return similar_customers

# Generate recommendations for the first 20 customers (C0001 - C0020)
lookalike_map = {}
for customer_id in features.index[:20]:
    similar_customers = get_top_similar_customers(customer_id, similarity_df)
    lookalike_map[customer_id] = list(zip(similar_customers.index, similar_customers.values))

# Convert the map to a DataFrame
lookalike_df = pd.DataFrame(lookalike_map.items(), columns=['CustomerID', 'SimilarCustomers'])

# Save the results to a CSV file
lookalike_df.to_csv('Lookalike.csv', index=False)

# Display the Lookalike DataFrame
print("\nLookalike Recommendations:")
print(lookalike_df)


Lookalike Recommendations:
   CustomerID                                   SimilarCustomers
0       C0001  [(C0190, 0.968215451295126), (C0048, 0.9410720...
1       C0002  [(C0088, 0.9544024281819798), (C0134, 0.927314...
2       C0003  [(C0052, 0.9847977904024423), (C0152, 0.926264...
3       C0004  [(C0165, 0.9738306048704476), (C0155, 0.952873...
4       C0005  [(C0186, 0.9717374531257306), (C0146, 0.949313...
5       C0006  [(C0168, 0.978264001321209), (C0171, 0.9387564...
6       C0007  [(C0140, 0.9798101409546568), (C0115, 0.920161...
7       C0008  [(C0139, 0.9135397162249627), (C0194, 0.871524...
8       C0009  [(C0010, 0.9760669630706748), (C0198, 0.952035...
9       C0010  [(C0009, 0.9760669630706748), (C0111, 0.970850...
10      C0011  [(C0137, 0.924372866239144), (C0169, 0.8683356...
11      C0012  [(C0104, 0.9659896786844346), (C0113, 0.926666...
12      C0013  [(C0099, 0.9855644363688846), (C0108, 0.919846...
13      C0014  [(C0060, 0.9763044912298496), (C0128, 0.948666.

In [52]:
# Display the Lookalike.csv file
print("\nLookalike.csv File:")
print(pd.read_csv('Lookalike.csv'))

# Insights into the similarity scores
print("\nInsights into Similarity Scores:")
for customer_id, similar_customers in lookalike_map.items():
    print(f"Customer {customer_id} has the following top 3 lookalikes:")
    for similar_customer, score in similar_customers:
        print(f"  - Customer {similar_customer} with similarity score {score:.2f}")
    print()


Lookalike.csv File:
   CustomerID                                   SimilarCustomers
0       C0001  [('C0190', 0.968215451295126), ('C0048', 0.941...
1       C0002  [('C0088', 0.9544024281819798), ('C0134', 0.92...
2       C0003  [('C0052', 0.9847977904024423), ('C0152', 0.92...
3       C0004  [('C0165', 0.9738306048704476), ('C0155', 0.95...
4       C0005  [('C0186', 0.9717374531257306), ('C0146', 0.94...
5       C0006  [('C0168', 0.978264001321209), ('C0171', 0.938...
6       C0007  [('C0140', 0.9798101409546568), ('C0115', 0.92...
7       C0008  [('C0139', 0.9135397162249627), ('C0194', 0.87...
8       C0009  [('C0010', 0.9760669630706748), ('C0198', 0.95...
9       C0010  [('C0009', 0.9760669630706748), ('C0111', 0.97...
10      C0011  [('C0137', 0.924372866239144), ('C0169', 0.868...
11      C0012  [('C0104', 0.9659896786844346), ('C0113', 0.92...
12      C0013  [('C0099', 0.9855644363688846), ('C0108', 0.91...
13      C0014  [('C0060', 0.9763044912298496), ('C0128', 0.94...
14  