In [2]:
# Import libraries
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the three raw tables (customers, products, transactions) from disk.
customers = pd.read_csv(r"C:\Users\sachu\Downloads\OneDrive\Customers.csv")
products = pd.read_csv(r"C:\Users\sachu\Downloads\OneDrive\Products.csv")
transactions = pd.read_csv(r"C:\Users\sachu\Downloads\OneDrive\Transactions.csv")

# Left-join product attributes onto every transaction row ...
transactions_products = transactions.merge(products, how='left', on='ProductID')

# ... and then the customer attributes. Both joins are left joins starting from
# the transactions table, so only customers with at least one transaction
# appear in the result.
merged_data = transactions_products.merge(customers, how='left', on='CustomerID')

# Collapse to one row per customer describing their purchase behavior.
customer_data = (
    merged_data
    .groupby('CustomerID')
    .agg(
        TotalSpent=pd.NamedAgg(column='TotalValue', aggfunc='sum'),
        TotalQuantity=pd.NamedAgg(column='Quantity', aggfunc='sum'),
        AvgTransactionValue=pd.NamedAgg(column='TotalValue', aggfunc='mean'),
        # Takes the first Region seen per customer; assumes a customer's
        # Region is the same on all of their rows.
        Regions=pd.NamedAgg(column='Region', aggfunc='first'),
    )
    .reset_index()
)

print(customer_data.head())


  CustomerID  TotalSpent  TotalQuantity  AvgTransactionValue        Regions
0      C0001     3354.52             12              670.904  South America
1      C0002     1862.74             10              465.685           Asia
2      C0003     2725.38             14              681.345  South America
3      C0004     5354.88             23              669.360  South America
4      C0005     2034.24              7              678.080           Asia


In [3]:
# Expand the categorical 'Regions' column into indicator columns.
# drop_first=True omits the column for the first category, so that region is
# represented by all indicator columns being False.
# NOTE(review): for a similarity computation this makes the dropped region
# contribute nothing to the dot product, unlike the others - confirm intended.
customer_data = pd.get_dummies(customer_data, columns=['Regions'], drop_first=True)

# Bring the numeric purchase features onto a common scale
# (StandardScaler: zero mean, unit variance per column).
numerical_cols = ['TotalSpent', 'TotalQuantity', 'AvgTransactionValue']
scaler = StandardScaler()
scaled_values = scaler.fit_transform(customer_data[numerical_cols])
customer_data[numerical_cols] = scaled_values

print(customer_data.head())


  CustomerID  TotalSpent  TotalQuantity  AvgTransactionValue  Regions_Europe  \
0      C0001   -0.061701      -0.122033            -0.070263           False   
1      C0002   -0.877744      -0.448000            -0.934933           False   
2      C0003   -0.405857       0.203934            -0.026271           False   
3      C0004    1.032547       1.670787            -0.076769           False   
4      C0005   -0.783929      -0.936951            -0.040028           False   

   Regions_North America  Regions_South America  
0                  False                   True  
1                  False                  False  
2                  False                   True  
3                  False                   True  
4                  False                  False  


In [4]:
# Separate the identifier column from the columns used as model inputs.
customer_ids = customer_data['CustomerID']
features = customer_data.drop(columns=['CustomerID'])

# Pairwise cosine similarity between customers over all remaining columns.
similarity_matrix = cosine_similarity(features)

# Wrap the matrix in a DataFrame labelled by CustomerID on both axes so a
# pair's score can be looked up as similarity_df.loc[id_a, id_b].
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=customer_ids,
    columns=customer_ids,
)

print(similarity_df.head())


CustomerID     C0001     C0002     C0003     C0004     C0005     C0006  \
CustomerID                                                               
C0001       1.000000  0.126983  0.901451  0.330662  0.133845  0.457779   
C0002       0.126983  1.000000  0.193944 -0.528434  0.689766 -0.684003   
C0003       0.901451  0.193944  1.000000  0.381233  0.095422  0.366739   
C0004       0.330662 -0.528434  0.381233  1.000000 -0.879882  0.263998   
C0005       0.133845  0.689766  0.095422 -0.879882  1.000000 -0.117982   

CustomerID     C0007     C0008     C0009     C0010  ...     C0191     C0192  \
CustomerID                                          ...                       
C0001       0.062142 -0.047836  0.136723  0.084226  ...  0.938920  0.718077   
C0002       0.059228  0.041652  0.894822  0.797524  ...  0.454431  0.755641   
C0003       0.015691  0.042653  0.092188  0.202265  ...  0.847917  0.649557   
C0004      -0.718831  0.585021 -0.630963 -0.284865  ...  0.114885 -0.283647   
C0005  

In [8]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load datasets
# NOTE(review): these are absolute, machine-specific paths; point them at the
# local copies of the CSV files before re-running.
customers = pd.read_csv(r"C:\Users\sachu\Downloads\OneDrive\Customers.csv")
products = pd.read_csv(r"C:\Users\sachu\Downloads\OneDrive\Products.csv")
transactions = pd.read_csv(r"C:\Users\sachu\Downloads\OneDrive\Transactions.csv")

# How many customers to score (taken from the start of the CustomerID-sorted
# feature table) and how many lookalikes to keep for each of them.
NUM_TARGET_CUSTOMERS = 20
TOP_N_LOOKALIKES = 3

# Merge datasets to create a unified view (left joins keep every transaction row)
transactions_products = pd.merge(transactions, products, on='ProductID', how='left')
merged_data = pd.merge(transactions_products, customers, on='CustomerID', how='left')

# Aggregate customer purchase behavior: total and average spend, number of
# transactions, and number of distinct product categories bought
customer_features = merged_data.groupby('CustomerID').agg(
    TotalSpent=('TotalValue', 'sum'),
    AvgSpentPerTransaction=('TotalValue', 'mean'),
    NumTransactions=('TransactionID', 'count'),
    NumCategories=('Category', 'nunique')
).reset_index()

# Standardize the features for similarity calculation. Columns are selected by
# name so the result does not depend on CustomerID being the first column.
feature_cols = ['TotalSpent', 'AvgSpentPerTransaction', 'NumTransactions', 'NumCategories']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features[feature_cols])

# Calculate pairwise cosine similarity between customers
similarity_matrix = cosine_similarity(scaled_features)

# Find the top lookalikes for the first NUM_TARGET_CUSTOMERS customers
customer_ids = customer_features['CustomerID'].tolist()
lookalike_results = {}

for idx, cust_id in enumerate(customer_ids[:NUM_TARGET_CUSTOMERS]):
    # Pair every other customer with its similarity to the current one.
    # The current customer is skipped by position (its self-similarity is not a
    # lookalike). Scores are cast to plain Python floats: the list of tuples is
    # stringified when written to CSV, and NumPy >= 2 would otherwise render
    # each score as "np.float64(...)" instead of a bare number.
    similarity_scores = sorted(
        (
            (other_id, float(score))
            for pos, (other_id, score) in enumerate(zip(customer_ids, similarity_matrix[idx]))
            if pos != idx
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep only the most similar customers
    lookalike_results[cust_id] = similarity_scores[:TOP_N_LOOKALIKES]

# Convert results into the Lookalike.csv layout: one row per scored customer,
# with a list of (lookalike CustomerID, similarity score) pairs
lookalike_df = pd.DataFrame([
    {"CustomerID": cust_id, "Lookalikes": lookalikes}
    for cust_id, lookalikes in lookalike_results.items()
])

# Save results to CSV (written relative to the current working directory)
lookalike_df.to_csv('Lookalike.csv', index=False)


In [9]:
# Read the generated Lookalike.csv back from disk and preview it as a sanity
# check. The Lookalikes column comes back as text, since CSV has no list type.
generated_df = pd.read_csv('Lookalike.csv')
print(generated_df.head(20))  # Display the first 20 rows


   CustomerID                                         Lookalikes
0       C0001  [('C0086', 0.9965598542786218), ('C0189', 0.99...
1       C0002  [('C0199', 0.9982471461816442), ('C0010', 0.99...
2       C0003  [('C0178', 0.9995885797002526), ('C0036', 0.97...
3       C0004  [('C0101', 0.9970619994749313), ('C0156', 0.99...
4       C0005  [('C0073', 0.9996684843099329), ('C0159', 0.99...
5       C0006  [('C0079', 0.99998217937406), ('C0196', 0.9919...
6       C0007  [('C0080', 0.9929249994633612), ('C0078', 0.99...
7       C0008  [('C0109', 0.9709038923612349), ('C0147', 0.94...
8       C0009  [('C0077', 0.9997951665967079), ('C0083', 0.99...
9       C0010  [('C0002', 0.9979534969456437), ('C0199', 0.99...
10      C0011  [('C0114', 0.9983696656678615), ('C0183', 0.99...
11      C0012  [('C0155', 0.9978463141889224), ('C0065', 0.99...
12      C0013  [('C0126', 0.9926214308944414), ('C0105', 0.99...
13      C0014  [('C0058', 0.9964466913798511), ('C0151', 0.99...
14      C0015  [('C0095',

In [10]:
import os

# Save the CSV file. lookalike_df must already exist in the kernel; the
# relative file name is resolved against the current working directory.
file_name = "Lookalike.csv"
lookalike_df.to_csv(file_name, index=False)

# Get the absolute path of the file so the reader can see where it was written
file_path = os.path.abspath(file_name)
print(f"The file has been saved at: {file_path}")


The file has been saved at: C:\Users\sachu\Lookalike.csv
