In [2]:
#Import all the important files 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

In [3]:
# creating a data frame from CSV
customer = pd.read_csv("Customer.csv")
prod_cat_info = pd.read_csv("prod_cat_info.csv")
transactions=pd.read_csv("Transactions.csv")

In [4]:
# changing column label to similar in all tables
customer.rename(columns = {'customer_Id':'cust_id'}, inplace = True)
# removing row if customer_id/prod_sub_cat_code is null from customer and prod_cat_info dataframe
customer.dropna(subset=['cust_id'],inplace=True)
prod_cat_info.dropna(subset=['prod_sub_cat_code'],inplace=True)

In [5]:
#display top 5 rows of customer data
customer.head(5)

Unnamed: 0,cust_id,DOB,Gender,city_code
0,268408,02-01-1970,M,4.0
1,269696,07-01-1970,F,8.0
2,268159,08-01-1970,F,8.0
3,270181,10-01-1970,F,2.0
4,268073,11-01-1970,M,1.0


In [6]:
#display top 5 rows of prod_cat_info data
prod_cat_info.head(5)

Unnamed: 0,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat
0,1,Clothing,4,Mens
1,1,Clothing,1,Women
2,1,Clothing,3,Kids
3,2,Footwear,1,Mens
4,2,Footwear,3,Women


In [7]:
#display top 5 rows of transactions data
transactions.head(5)

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type
0,80712190438,270351,28/02/14,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27/02/14,5,3,-5,-1497,785.925,-8270.925,e-Shop
2,51750724947,273420,24/02/14,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24/02/14,11,6,-3,-1363,429.345,-4518.345,e-Shop
4,51750724947,273420,23/02/14,6,5,-2,-791,166.11,-1748.11,TeleShop


In [8]:
#display rows and columns of customer data
customer.shape

(5647, 4)

In [9]:
#display rows and columns of prod_cat_info data
prod_cat_info.shape

(23, 4)

In [10]:
#display rows and columns of transactions data
transactions.shape

(23053, 10)

In [12]:

customer.dropna(subset=['cust_id'], inplace=True)
customer['Gender'].fillna('F', inplace=True)

# Convert 'DOB' column to datetime with the correct format
customer['DOB'] = pd.to_datetime(customer['DOB'], format='%d-%m-%Y')

# Fill missing values in 'DOB' with a default date
customer['DOB'].fillna(pd.to_datetime('1900-01-01'), inplace=True)

# Fill missing values in 'city_code' with -1
customer['city_code'].fillna(-1, inplace=True)

# Convert 'city_code' to numeric and then to integer
customer['city_code'] = abs(pd.to_numeric(customer['city_code'], errors='coerce').fillna(-1).astype(int))

# Verify the data types
print(customer.dtypes)


cust_id               int64
DOB          datetime64[ns]
Gender               object
city_code             int64
dtype: object


In [13]:
#display top 5 rows of transactions data
transactions.head(5)

Unnamed: 0,transaction_id,cust_id,tran_date,prod_subcat_code,prod_cat_code,Qty,Rate,Tax,total_amt,Store_type
0,80712190438,270351,28/02/14,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27/02/14,5,3,-5,-1497,785.925,-8270.925,e-Shop
2,51750724947,273420,24/02/14,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24/02/14,11,6,-3,-1363,429.345,-4518.345,e-Shop
4,51750724947,273420,23/02/14,6,5,-2,-791,166.11,-1748.11,TeleShop


In [14]:
# creating super table by joining customer and prod_cat_info to transactions
df1=transactions.merge(customer,how='left',on='cust_id')
transaction_master_bi=pd.merge(left=df1, right=prod_cat_info,how='left',left_on=['prod_cat_code','prod_subcat_code'],right_on=['prod_cat_code','prod_sub_cat_code'])
transaction_master_bi.drop(columns='prod_sub_cat_code',axis=1,inplace=True)


In [15]:
prod_cat_info

Unnamed: 0,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat
0,1,Clothing,4,Mens
1,1,Clothing,1,Women
2,1,Clothing,3,Kids
3,2,Footwear,1,Mens
4,2,Footwear,3,Women
5,2,Footwear,4,Kids
6,3,Electronics,4,Mobiles
7,3,Electronics,5,Computers
8,3,Electronics,8,Personal Appliances
9,3,Electronics,9,Cameras


In [16]:
# correceting data type of numeric and date columns
transaction_master_bi['tran_date'] = pd.to_datetime(transaction_master_bi['tran_date'],infer_datetime_format=True)
transaction_master_bi['Qty']=abs(transaction_master_bi['Qty'])
transaction_master_bi['Rate']=abs(transaction_master_bi['Rate'])
transaction_master_bi['total_amt']=abs(transaction_master_bi['total_amt'])

  transaction_master_bi['tran_date'] = pd.to_datetime(transaction_master_bi['tran_date'],infer_datetime_format=True)
  transaction_master_bi['tran_date'] = pd.to_datetime(transaction_master_bi['tran_date'],infer_datetime_format=True)


In [17]:
# handling null values
transaction_master_bi.dropna(subset=['transaction_id','cust_id','prod_subcat_code','prod_cat_code','tran_date'],inplace=True)
transaction_master_bi['Store_type'].fillna('NA',inplace=True)
transaction_master_bi['Qty'].fillna(1,inplace=True)
transaction_master_bi['Qty'] = transaction_master_bi['Qty'].apply(lambda x: 1 if x == 0 else x)
transaction_master_bi['Rate'].fillna(transaction_master_bi['Rate'].mean(),inplace=True)

In [18]:
#Calculating Average Tax
avg_tax=transaction_master_bi['Tax'].mean()
amount_before_tax=transaction_master_bi['Rate']*transaction_master_bi['Qty']
avg_amount=amount_before_tax.mean()
avg_tax_rate=avg_tax/avg_amount
avg_tax=avg_tax_rate*avg_amount
transaction_master_bi['Tax'].fillna(avg_tax,inplace=True)
total_amount=transaction_master_bi['Rate']*transaction_master_bi['Qty']+transaction_master_bi['Tax']
avg_total_amount=total_amount.mean()
transaction_master_bi['total_amt'].fillna(avg_total_amount,inplace=True)


In [19]:
ftd=transaction_master_bi.groupby('cust_id').min()['tran_date']
transaction_master_bi['First Purchase Date']=transaction_master_bi.apply(lambda row: ftd.loc[row['cust_id']], axis=1)

In [21]:
from datetime import date
import numpy as np
today=date.today()
transaction_master_bi['user_age_in_system_month']=(today-transaction_master_bi['First Purchase Date'].dt.date)/np.timedelta64(1,'m')
transaction_master_bi['user_age_in_year']=(today-transaction_master_bi['DOB'].dt.date)/np.timedelta64(1,'D')/365
transaction_master_bi['user_age_in_year']=pd.to_numeric(transaction_master_bi['user_age_in_year'])
tot_txn=transaction_master_bi.groupby('cust_id').count()['transaction_id']
transaction_master_bi['lifetime_txn']=transaction_master_bi.apply(lambda row: tot_txn.loc[row['cust_id']], axis=1)
df=transaction_master_bi[['cust_id','prod_cat','Qty','total_amt']]

In [22]:
df['avg_price']=df.apply(lambda row: row['total_amt'] / row['Qty'], axis=1)
df1=df.groupby(['cust_id','prod_cat']).mean()['avg_price'].to_frame().reset_index()
df1.rename(columns = {'avg_price':'user_aov'}, inplace = True)
transaction_master_bi=pd.merge(left=transaction_master_bi, right=df1,how='left',left_on=['cust_id','prod_cat'],right_on=['cust_id','prod_cat'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['avg_price']=df.apply(lambda row: row['total_amt'] / row['Qty'], axis=1)


In [23]:
age_ranges = pd.IntervalIndex.from_tuples([(0, 18), (19, 25), (26, 35), (36, 120)])
transaction_master_bi["age_range"] = pd.cut(transaction_master_bi["user_age_in_year"], age_ranges, labels=["<18", "18-25", "25-35", ">35"])

In [24]:
transaction_master_bi["product_code"]=transaction_master_bi['prod_cat'].astype(str)+' '+transaction_master_bi['prod_subcat'].astype(str)

In [25]:
Product_purchase=transaction_master_bi.groupby(['prod_subcat','prod_cat']).agg({'transaction_id': 'nunique','cust_id': 'nunique','Qty':'sum','user_aov':'mean'}).reset_index()
Product_purchase.rename(columns = {'transaction_id':'unique_orders','cust_id':'users','Qty':'tot_orders','user_aov':'unit_price'},inplace=True)
Product_Master_bi=pd.merge(left=prod_cat_info, right=Product_purchase,how='left',left_on=['prod_cat','prod_subcat'],right_on=['prod_cat','prod_subcat'])
Product_Master_bi['product_code']=Product_Master_bi.apply(lambda row: str(row['prod_cat']) +' '+ str(row['prod_subcat']), axis=1)
platform_orders=sum(Product_Master_bi['tot_orders'])
Product_Master_bi['order_percentage']=Product_Master_bi['tot_orders']/platform_orders

In [26]:
customer_purchase=transaction_master_bi.groupby(['cust_id','prod_cat','prod_subcat']).agg({'transaction_id': 'nunique','Qty':'sum','total_amt':'mean','user_aov':'mean'}).reset_index()
customer_purchase.rename(columns = {'transaction_id':'unique_orders','Qty':'tot_orders','total_amt':'aov','user_aov':'unit_price'},inplace=True)
customer_purchase['product_code']=customer_purchase.apply(lambda row: str(row['prod_cat']) +' '+ str(row['prod_subcat']), axis=1)

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
Product_Master_bi['content'] = Product_Master_bi['product_code']
# Create TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(Product_Master_bi['content'])

# Compute the cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
product_mapping = {}
for idx, row in Product_Master_bi.iterrows():
    product_mapping[(row['product_code'])] = idx

In [28]:
# Function to get recommendations
def get_similar_products(product_info, cosine_sim=cosine_sim):
    product_indices = [product_mapping.get((product_code), -1) for product_code in product_info]
    product_indices = [idx for idx in product_indices if idx != -1]  # Remove -1 (not found) from the list
    all_recommendations = []

    for product_index in product_indices:
        sim_scores = list(enumerate(cosine_sim[product_index]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[0:]  # Get the top 3 similar products (excluding itself)
        indices = [index for index, _ in sim_scores]
        recommendations = Product_Master_bi.iloc[indices].copy()
        recommendations['sim_score'] = [sim_score for _, sim_score in sim_scores]  # Add sim_score column
        all_recommendations.append(recommendations)
    final_df=pd.concat(all_recommendations, ignore_index=True)
    net_score=final_df.groupby('product_code').sum()['sim_score'].to_frame().reset_index()
    recom=pd.merge(left=final_df, right=net_score,how='left',left_on=['product_code'],right_on=['product_code'])
    recom.drop(columns='sim_score_x',inplace=True)
    recom.drop_duplicates(inplace=True)
    return recom

In [29]:
def user_recommendation(user_id,top_n=10):
    user_data=customer_purchase[customer_purchase['cust_id']==user_id]
    user_unit_price_avg=sum(user_data['aov']*user_data['unique_orders'])/sum(user_data['tot_orders'])
    user_products=list(user_data['product_code'])
    similar_products=get_similar_products(user_products, cosine_sim=cosine_sim)
    similar_products['product_score']=similar_products['order_percentage']+similar_products['sim_score_y']
    similar_products['product_score'] = similar_products.apply(lambda row: row['product_score'] + 0.1 if (row['unit_price'] >=0.9*user_unit_price_avg) & (row['unit_price']<=1.1*user_unit_price_avg) else row['product_score'], axis=1)
    sorted_similar_products = similar_products.sort_values(by='product_score',ascending=False)
    recommended_products=sorted_similar_products[['product_code','product_score']]
    return recommended_products.head(top_n)


In [30]:
user_id=266783
top_n=15
product_recommendation=user_recommendation(user_id,top_n).reset_index()
product_recommendation

Unnamed: 0,index,product_code,product_score
0,6,Clothing Mens,1.542061
1,9,Footwear Mens,1.54059
2,0,Books Non-Fiction,1.042753
3,17,Bags Mens,0.992788
4,1,Books Fiction,0.783008
5,7,Clothing Women,0.544499
6,10,Footwear Women,0.544354
7,11,Footwear Kids,0.519574
8,8,Clothing Kids,0.517641
9,3,Books Children,0.278234


In [31]:
#accuracy check
from sklearn.model_selection import train_test_split
transaction_master_bi_train,transaction_master_bi_test=train_test_split(transaction_master_bi,test_size=0.25,random_state=42)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
Product_purchase_train=transaction_master_bi_train.groupby(['prod_subcat','prod_cat']).agg({'transaction_id': 'nunique','cust_id': 'nunique','Qty':'sum','user_aov':'mean'}).reset_index()
Product_purchase_train.rename(columns = {'transaction_id':'unique_orders','cust_id':'users','Qty':'tot_orders','user_aov':'unit_price'},inplace=True)
Product_Master_bi_train=pd.merge(left=prod_cat_info, right=Product_purchase_train,how='left',left_on=['prod_cat','prod_subcat'],right_on=['prod_cat','prod_subcat'])
Product_Master_bi_train['product_code']=Product_Master_bi_train.apply(lambda row: str(row['prod_cat']) +' '+ str(row['prod_subcat']), axis=1)
platform_orders_train=sum(Product_Master_bi_train['tot_orders'])
Product_Master_bi_train['order_percentage']=Product_Master_bi_train['tot_orders']/platform_orders
Product_Master_bi_train['content'] = Product_Master_bi_train['product_code']
# Create TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(Product_Master_bi_train['content'])

# Compute the cosine similarity
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
product_mapping = {}
for idx, row in Product_Master_bi_train.iterrows():
    product_mapping[(row['product_code'])] = idx


In [33]:
customer_purchase_train=transaction_master_bi_train.groupby(['cust_id','prod_cat','prod_subcat']).agg({'transaction_id': 'nunique','Qty':'sum','total_amt':'mean','user_aov':'mean'}).reset_index()
customer_purchase_train.rename(columns = {'transaction_id':'unique_orders','Qty':'tot_orders','total_amt':'aov','user_aov':'unit_price'},inplace=True)
customer_purchase_train['product_code']=customer_purchase_train.apply(lambda row: str(row['prod_cat']) +' '+ str(row['prod_subcat']), axis=1)
platform_orders=sum(Product_Master_bi_train['tot_orders'])
Product_Master_bi_train['order_percentage']=Product_Master_bi_train['tot_orders']/platform_orders

In [34]:
def get_similar_products_accuracy(product_info, cosine_sim=cosine_sim):
    product_indices = [product_mapping.get((product_code), -1) for product_code in product_info]
    product_indices = [idx for idx in product_indices if idx != -1]  # Remove -1 (not found) from the list
    all_recommendations = []

    for product_index in product_indices:
        sim_scores = list(enumerate(cosine_sim[product_index]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[0:]
        indices = [index for index, _ in sim_scores]
        recommendations = Product_Master_bi_train.iloc[indices].copy()
        recommendations['sim_score'] = [sim_score for _, sim_score in sim_scores]  # Add sim_score column
        all_recommendations.append(recommendations)
    final_df=pd.concat(all_recommendations, ignore_index=True)
    net_score=final_df.groupby('product_code').sum()['sim_score'].to_frame().reset_index()
    recom=pd.merge(left=final_df, right=net_score,how='left',left_on=['product_code'],right_on=['product_code'])
    recom.drop(columns='sim_score_x',inplace=True)
    recom.drop_duplicates(inplace=True)
    return recom

In [35]:
def user_recommendation_accuracy(user_id,top_n=10):
    user_data=customer_purchase_train[customer_purchase_train['cust_id']==user_id]
    user_unit_price_avg=sum(user_data['aov']*user_data['unique_orders'])/sum(user_data['tot_orders'])
    user_products=list(user_data['product_code'].unique())
    similar_products=get_similar_products_accuracy(user_products, cosine_sim=cosine_sim)
    similar_products['product_score']=similar_products['order_percentage']+similar_products['sim_score_y']
    similar_products['net_score'] = similar_products.apply(lambda row: row['product_score'] + 0.1 if row['unit_price'] <= 1.2 * user_unit_price_avg else row['product_score'], axis=1)
    sorted_similar_products = similar_products.sort_values(by='net_score',ascending=False)
    recommended_products=sorted_similar_products[['product_code','product_score']]
    return recommended_products.head(top_n)

In [36]:
from sklearn.metrics import precision_score, recall_score, f1_score, average_precision_score
cust_ids_train = transaction_master_bi_train['cust_id'].unique().tolist()
cust_ids_test=transaction_master_bi_test['cust_id'].unique().tolist()
cust_ids_to_evaluate=list(set(cust_ids_test) & set(cust_ids_train))
actual_test_set = transaction_master_bi_test.groupby('cust_id')['product_code'].agg(list).to_dict()
precision_scores = []
recall_scores = []
f1_scores = []
average_precision_scores = []
for user_id in cust_ids_to_evaluate:
    Recommendations =user_recommendation_accuracy(user_id,top_n=15)
    actual_purchases_for_user =list(set(actual_test_set.get(user_id, [])))
    predicted_purchases_for_user = Recommendations['product_code'].tolist()

    # True labels for the user
    true_labels = [1 if prod_code in actual_purchases_for_user else 0 for prod_code in actual_test_set[user_id]]

    # Predicted scores for the recommendations
    predicted_scores = [1 if prod_code in predicted_purchases_for_user else 0 for prod_code in actual_test_set[user_id]]

    # Calculate precision, recall, F1-score, and average precision
    precision = precision_score(true_labels, predicted_scores)
    recall = recall_score(true_labels, predicted_scores)
    f1 = f1_score(true_labels, predicted_scores)
    average_precision = average_precision_score(true_labels, predicted_scores)

    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)
    average_precision_scores.append(average_precision)

# Calculate the mean scores across users
mean_precision = sum(precision_scores) / len(precision_scores)
mean_recall = sum(recall_scores) / len(recall_scores)
mean_f1 = sum(f1_scores) / len(f1_scores)
mean_average_precision = sum(average_precision_scores) / len(average_precision_scores)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [37]:
print(f"Mean Precision: {mean_precision:.4f}")
print(f"Mean Recall: {mean_recall:.4f}")
print(f"Mean F1-Score: {mean_f1:.4f}")
print(f"Mean Average Precision: {mean_average_precision:.4f}")

Mean Precision: 0.8041
Mean Recall: 0.7109
Mean F1-Score: 0.7421
Mean Average Precision: 1.0000
