## Task 2




### Importing libraries

In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

### Load Data

In [29]:
customers = pd.read_csv('Customers.csv')
products = pd.read_csv('Products.csv')
transactions = pd.read_csv('Transactions.csv')

### Checking data types

In [30]:
customers['SignupDate'] = pd.to_datetime(customers['SignupDate'])
transactions['TransactionDate'] = pd.to_datetime(transactions['TransactionDate'])


### Merging Data

In [31]:
merged_data = pd.merge(transactions, customers, on='CustomerID')
merged_data = pd.merge(merged_data, products, on='ProductID')

### Feature engineering

In [32]:
customer_features = merged_data.groupby('CustomerID').agg(
    TotalSpending=('TotalValue', 'sum'),
    TransactionCount=('TransactionID', 'nunique'),
    AvgTransactionValue=('TotalValue', 'mean'),
    FavoriteCategory=('Category', lambda x: x.mode()[0]),
    AvgProductPrice=('Price_y', 'mean')  # Use 'Price_y' (from Products.csv)
).reset_index()

###  Merging with customer region

In [33]:
customer_features = pd.merge(customer_features, customers[['CustomerID', 'Region']], on='CustomerID')

### One-hot encode categorical features

In [34]:
customer_features = pd.get_dummies(customer_features, columns=['Region', 'FavoriteCategory'])

### Normalize numerical features

In [35]:
scaler = MinMaxScaler()
numerical_features = ['TotalSpending', 'TransactionCount', 'AvgTransactionValue', 'AvgProductPrice']
customer_features[numerical_features] = scaler.fit_transform(customer_features[numerical_features])

### Computing cosine similarity

In [36]:
similarity_matrix = cosine_similarity(customer_features.drop(columns=['CustomerID']))
lookalike_map = {}

## Get top 3 lookalikes for each customer

In [37]:
for i, customer in enumerate(customer_features['CustomerID']):
    similarity_scores = similarity_matrix[i]
    top_indices = np.argsort(similarity_scores)[-4:-1] 
    lookalikes = [(customer_features.iloc[idx]['CustomerID'], similarity_scores[idx]) for idx in top_indices]
    lookalike_map[customer] = lookalikes

### Results of the first 20 customers

In [38]:
lookalike_results = {k: v for k, v in lookalike_map.items() if k in [f'C{i:04d}' for i in range(1, 21)]}

### Converting to DataFrame and save as CSV

In [39]:
lookalike_df = pd.DataFrame(lookalike_results.items(), columns=['CustomerID', 'Lookalikes'])
lookalike_df.to_csv('Lookalike.csv', index=False)

In [40]:
print(lookalike_results)

{'C0001': [('C0048', 0.9950690613819692), ('C0181', 0.9965197369448543), ('C0190', 0.9966931152299675)], 'C0002': [('C0106', 0.9941927737587308), ('C0134', 0.994293589362952), ('C0088', 0.9972860777903121)], 'C0003': [('C0031', 0.9934455806554768), ('C0152', 0.9936573983161197), ('C0052', 0.9969992174703901)], 'C0004': [('C0153', 0.99090454545014), ('C0169', 0.9911838720110644), ('C0165', 0.9955186499931888)], 'C0005': [('C0007', 0.9945768778113777), ('C0146', 0.9961566777731068), ('C0186', 0.9981717526495861)], 'C0006': [('C0187', 0.9941245858589293), ('C0168', 0.9951676845599187), ('C0171', 0.9972810686606883)], 'C0007': [('C0140', 0.9937442904854265), ('C0005', 0.9945768778113777), ('C0115', 0.996488670368942)], 'C0008': [('C0038', 0.958087410056173), ('C0189', 0.9584174404031849), ('C0065', 0.978108948637362)], 'C0009': [('C0062', 0.9836452525114444), ('C0103', 0.9843189490454441), ('C0198', 0.9906325382380973)], 'C0010': [('C0198', 0.9820019158579849), ('C0062', 0.9920065603227987

## For real time Lookalike from user input 

### Dictionary to map CustomerID to index

In [41]:
customer_to_index = {customer: idx for idx, customer in enumerate(customer_features['CustomerID'])}

### Function to get top 3 lookalikes for a given customer

In [42]:
def get_lookalikes(customer_id):
    if customer_id not in customer_to_index:
        return "CustomerID not found in the dataset."
    idx = customer_to_index[customer_id]
    similarity_scores = similarity_matrix[idx]
    similarity_scores[idx] = -1  
    top_indices = np.argsort(similarity_scores)[-3:][::-1]  
    lookalikes = [(customer_features.iloc[i]['CustomerID'], similarity_scores[i]) for i in top_indices]
    return lookalikes

###  Real-time input

In [43]:
while True:
    user_input = input("Enter CustomerID (or 'exit' to quit): ")
    if user_input.lower() == 'exit':
        break
    recommendations = get_lookalikes(user_input)
    
    if isinstance(recommendations, str):
        print(recommendations)
    else:
        print(f"Top 3 lookalikes for {user_input}:")
        for cust_id, score in recommendations:
            print(f"CustomerID: {cust_id}, Similarity Score: {score:.4f}")

Top 3 lookalikes for C0001:
CustomerID: C0190, Similarity Score: 0.9967
CustomerID: C0181, Similarity Score: 0.9965
CustomerID: C0048, Similarity Score: 0.9951
Top 3 lookalikes for C0001:
CustomerID: C0190, Similarity Score: 0.9967
CustomerID: C0181, Similarity Score: 0.9965
CustomerID: C0048, Similarity Score: 0.9951
Top 3 lookalikes for C0002:
CustomerID: C0088, Similarity Score: 0.9973
CustomerID: C0134, Similarity Score: 0.9943
CustomerID: C0106, Similarity Score: 0.9942
