In [7]:
#importing and loading datsets
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

customers_url = "https://drive.google.com/uc?id=1bu_--mo79VdUG9oin4ybfFGRUSXAe-WE"
products_url = "https://drive.google.com/uc?id=1IKuDizVapw-hyktwfpoAoaGtHtTNHfd0"
transactions_url = "https://drive.google.com/uc?id=1saEqdbBB-vuk2hxoAf4TzDEsykdKlzbF"

customers = pd.read_csv(customers_url)
products = pd.read_csv(products_url)
transactions = pd.read_csv(transactions_url)

transaction_details = transactions.merge(products, on='ProductID', how='left')             # Combining Transactions and Products
customer_features = transaction_details.merge(customers, on='CustomerID', how='left')
print("Columns in customer_features:", customer_features.columns)                          # checking the columns

if 'Price' not in customer_features.columns:
    print("'Price' column not found. Using the available columns for aggregation.")        # Checking for Price Column
    features = ['Quantity', 'TotalValue']
else:
    features = ['Quantity', 'TotalValue', 'Price']

customer_features_agg = customer_features.groupby('CustomerID')[features].mean().reset_index()
scaler = StandardScaler()
scaled_features = scaler.fit_transform(customer_features_agg[features])
                                                                                          # Computing cosine similarity score
similarity_matrix = cosine_similarity(scaled_features)
similarity_df = pd.DataFrame(similarity_matrix, index=customer_features_agg['CustomerID'], columns=customer_features_agg['CustomerID'])
lookalike_results = {}
for customer_id in customer_features_agg['CustomerID'].head(20):
    similar_customers = similarity_df[customer_id].sort_values(ascending=False)[1:4]
    lookalike_results[customer_id] = [(similar_customer, similar_customers[similar_customer]) for similar_customer in similar_customers.index]


lookalike_df = pd.DataFrame.from_dict(lookalike_results, orient='index', columns=['Lookalike & score_1', 'Lookalike & score_2', 'Lookalike & score_3'])
lookalike_df.index.name = 'CustomerID'

lookalike_df.to_csv("Pranav_Pakalapati_Lookalike.csv")                                  # output into csv file


Columns in customer_features: Index(['TransactionID', 'CustomerID', 'ProductID', 'TransactionDate',
       'Quantity', 'TotalValue', 'Price_x', 'ProductName', 'Category',
       'Price_y', 'CustomerName', 'Region', 'SignupDate'],
      dtype='object')
'Price' column not found. Using the available columns for aggregation.


In [6]:
from google.colab import files
files.download("Pranav_Pakalapati_Lookalike.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>