## Importing Libraries

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

## Loading Datasets

In [9]:
# Load datasets
customers = pd.read_csv(r"C:\Users\Rakshith\Downloads\Customers.csv")
products = pd.read_csv(r"C:\Users\Rakshith\Downloads\Products.csv")
transactions = pd.read_csv(r"C:\Users\Rakshith\Downloads\Transactions.csv")

##  Merging Datasets

In [16]:
# Merge datasets
merged_data = transactions.merge(customers, on='CustomerID').merge(products, on='ProductID')
# Check the columns of the merged DataFrame
print(merged_data.columns)

Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'CustomerName', 'Region',
       'SignupDate', 'ProductName', 'Category', 'Price_y'],
      dtype='object')


In [18]:
# Prepare features for similarity calculation
features = merged_data[['Region', 'Price_y', 'Quantity', 'TotalValue']]
features = pd.get_dummies(features, columns=['Region'])

# Standardize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Calculate similarity matrix
similarity_matrix = cosine_similarity(scaled_features)

# Get the number of customers
num_customers = len(customers)

# Function to get lookalikes
def get_lookalikes(customer_id, top_n=3):
    try:
        index = customers[customers['CustomerID'] == customer_id].index[0]
        similar_indices = similarity_matrix[index].argsort()[-top_n-1:-1][::-1]
        
        # Prepare lookalikes with scores
        lookalikes = []
        for i in similar_indices:
            if i < num_customers and customers.iloc[i]['CustomerID'] != customer_id:  # Ensure index is valid and exclude the customer itself
                lookalikes.append((customers.iloc[i]['CustomerID'], similarity_matrix[index][i]))
        
        return lookalikes
    except IndexError:
        return []

# Generate lookalikes for first 20 customers
lookalike_results = {}
for cust_id in customers['CustomerID'][:20]:
    lookalikes = get_lookalikes(cust_id)
    if lookalikes:  # Only add if there are valid lookalikes
        lookalike_results[cust_id] = lookalikes

# Convert lookalike results to a DataFrame
lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index')

# Convert the lookalikes column to a string representation
#lookalike_df['Lookalikes'] = lookalike_df[0].apply(lambda x: ', '.join([f"({cust_id}, {score:.2f})" for cust_id, score in (x if isinstance(x, list) else [x])]))
#lookalike_df = lookalike_df[['Lookalikes']]


# Save results to CSV
#lookalike_df.to_csv('Lookalike.csv',index=True)
import csv

with open('Lookalike.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    #writer.writerow(['cust_id', 'lookalikes'])  # Write header if needed
    for cust_id, lookalikes in lookalike_results.items():
        writer.writerow([cust_id, lookalikes])

# Print the lookalike results for verification
for cust_id, lookalikes in lookalike_results.items():
    print(f"{cust_id}: {lookalikes}")

C0001: [('C0003', 1.0)]
C0004: [('C0010', 1.0000000000000002), ('C0013', 1.0000000000000002)]
C0005: [('C0011', 1.0000000000000002)]
C0010: [('C0013', 1.0000000000000002)]
C0013: [('C0010', 1.0000000000000002)]
C0014: [('C0015', 1.0000000000000002), ('C0097', 0.9993755620448931)]
C0015: [('C0097', 0.9993755620448931)]
C0017: [('C0096', 0.9990407104780167)]
C0018: [('C0036', 0.9955016644821895)]
C0019: [('C0098', 0.9996164922604521)]
