# Importing Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
import numpy as np
import random

# Load datasets

In [2]:
# Source tables, read in a single pass; the unpacking order matches the path order.
users, products, orders, order_items, inventory_items = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./Dataset/users.csv",
        "./Dataset/products.csv",
        "./Dataset/orders.csv",
        "./Dataset/order_items.csv",
        './Dataset/inventory_items.csv',
    )
)

# Separate CSV read into `data`, the frame the later cells clean and model on.
data_path = './final_df.csv'
data = pd.read_csv(data_path)

In [3]:
# Columns removed from `data`; names that are not present are skipped (errors='ignore').
columns_to_drop = [
    'Unnamed: 0.1',
    'Unnamed: 0',
    'created_at_item',
    'status_item',
    'inventory_item_id',
    'created_at_order',
    'num_of_item',
    'department',
    'sku',
    'distribution_center_id',
    'id_user',
    'first_name',
    'last_name',
    'email',
    'postal_code',
    'state',
    'street_address',
    'city',
    'country',
    'latitude_user',
    'longitude_user',
    'traffic_source_user',
    'created_at_user',
    'id_dist',
    'dist_center_name',
    'latitude_dist',
    'longitude_dist',
    'sequence_number',
    'browser',
    'traffic_source_event',
    'event_type',
]

# Age bucket edges; with right=False each bucket is [lower, upper).
bins = [0, 18, 25, 35, 45, 55, 65, 75, 85, 95, 105]
labels = [f'age:{i}-{j}' for i, j in zip(bins, bins[1:])]

# Drop the unused columns, remove exact duplicate rows (keeping the first
# occurrence), then attach the age bucket as `age_bin`.
data = (
    data
    .drop(columns=columns_to_drop, errors='ignore')
    .drop_duplicates(keep='first')
    .assign(age_bin=lambda frame: pd.cut(frame['age'], bins=bins, labels=labels, right=False))
)

# Model

In [4]:
# Prepare the interaction data: one row per (user, product) pair found in `data`,
# with the number of rows for that pair in `interaction_count`.
interaction_data = data.groupby(['user_id', 'product_id']).size().reset_index(name='interaction_count')

# Register every user id and product id with the LightFM dataset. The columns
# are read directly instead of walking the frame three times with iterrows().
dataset = Dataset()
dataset.fit(interaction_data['user_id'].unique(),
            interaction_data['product_id'].unique())

# Build the interaction matrix from the (user, product) pairs. Only the pairs
# are passed, so `interaction_count` is not used as a weight here.
(interactions, weights) = dataset.build_interactions(
    zip(interaction_data['user_id'].to_numpy(), interaction_data['product_id'].to_numpy())
)

# Initialize the LightFM model with the WARP loss function. A fixed
# random_state seeds the model's random initialisation.
# NOTE(review): fitting with num_threads > 1 may still vary between runs — confirm.
model = LightFM(loss='warp', random_state=42)

# Train the model (left as the last expression so the cell shows the fitted model).
model.fit(interactions, epochs=30, num_threads=2)


<lightfm.lightfm.LightFM at 0x108ea3f70>

In [5]:
def get_preferred_brand(user_id):
    """Return the brand the user has ordered most often.

    Every row of ``order_items`` for ``user_id`` counts once, so a product
    ordered twice counts twice. Brands are looked up in ``products`` by ``id``.
    Ties are broken at random with ``random.choice``.

    Args:
        user_id: Value matched against ``order_items['user_id']``.

    Returns:
        The most frequent brand, or ``None`` when the user has no order items
        or none of their products has a brand in ``products``. (Previously
        these cases raised ValueError / IndexError.)
    """
    purchased_ids = order_items.loc[order_items['user_id'] == user_id, 'product_id']
    if purchased_ids.empty:
        return None

    # One vectorised lookup instead of filtering `products` once per purchase.
    # drop_duplicates keeps the first catalogue row per id, which is the row
    # the original `.values[0]` lookup used.
    catalogue_rows = products[products['id'].isin(purchased_ids)].drop_duplicates('id')
    brand_by_product = catalogue_rows.set_index('id')['brand']

    # Purchases whose product or brand is missing simply do not vote.
    brands = purchased_ids.map(brand_by_product).dropna()
    if brands.empty:
        return None

    counts = brands.value_counts()
    top_brands = counts[counts == counts.max()].index.tolist()
    return random.choice(top_brands)

def get_user_age_and_spending(user_id):
    """Return ``(age, gender, average sale price)`` for one user.

    Age and gender come from the user's first row in ``data``; the average is
    the sum of the user's ``sale_price`` values divided by the number of
    non-null ones. Raises IndexError when ``data`` has no row for the user.
    """
    rows = data[data['user_id'] == user_id]
    first_age, first_gender = (rows[column].values[0] for column in ('age', 'gender'))
    prices = rows['sale_price']
    return first_age, first_gender, prices.sum() / prices.count()

def get_returned_products(user_id):
    """Return the product ids of the user's order items whose status is 'Cancelled' or 'Returned'."""
    belongs_to_user = order_items['user_id'] == user_id
    sent_back = order_items['status'].isin(['Cancelled', 'Returned'])
    return order_items.loc[belongs_to_user & sent_back, 'product_id'].values

def get_best_sellers(num_recommendations=5):
    """Return the ids of the products with the most rows in ``order_items``, most ordered first."""
    ranked_by_orders = (
        order_items
        .groupby('product_id')
        .size()
        .sort_values(ascending=False)
    )
    return ranked_by_orders.head(num_recommendations).index.tolist()

def get_new_arrivals(num_recommendations=5):
    """Return up to ``num_recommendations`` distinct product ids, newest inventory first.

    ``inventory_items`` is sorted by ``created_at`` in descending order. A
    product can own several inventory rows, so duplicates are removed before
    truncating; previously the same product id could be returned more than once.

    Args:
        num_recommendations: Maximum number of product ids to return.

    Returns:
        A list of distinct product ids, ordered by each product's newest row.
    """
    newest_first = inventory_items.sort_values(by='created_at', ascending=False)
    # drop_duplicates keeps the first (i.e. newest) occurrence of each product.
    distinct_products = newest_first['product_id'].drop_duplicates()
    return distinct_products.head(num_recommendations).tolist()

def get_product_gender_preference(product_id):
    """Return 'M' when the product's first inventory row is in the 'Men' department, otherwise 'F'.

    Raises IndexError when ``inventory_items`` has no row for the product.
    """
    departments = inventory_items.loc[inventory_items['product_id'] == product_id, 'product_department']
    if departments.values[0] == 'Men':
        return 'M'
    return 'F'

def get_product_brand(product_id):
    """Return ``product_brand`` from the product's first row in ``inventory_items``."""
    is_product = inventory_items['product_id'] == product_id
    return inventory_items.loc[is_product, 'product_brand'].values[0]

def get_purchased_products(user_id):
    """Return the user's rows from ``data`` in two forms.

    Returns:
        A 2-tuple: a DataFrame with the columns name, product_id, sale_price
        and brand, and an array holding the ``product_id`` of those same rows.
    """
    user_rows = data.loc[data['user_id'] == user_id]
    summary = user_rows[['name', 'product_id', 'sale_price', 'brand']]
    return summary, user_rows['product_id'].values

def get_product_price(product_id):
    """Return ``sale_price`` from the product's first row in ``data``."""
    is_product = data['product_id'] == product_id
    return data.loc[is_product, 'sale_price'].values[0]

def get_product_details_to_display(ids):
    """Build a display table (indexed from 1) describing the given products.

    Args:
        ids: Iterable of product ids found in ``products`` and ``inventory_items``.

    Returns:
        A DataFrame with one row per id and the columns Name, Retail Price,
        Brand, Gender Preference and Status. Status is taken from the
        product's first row in ``order_items`` and is ``None`` when the product
        has no order line (previously that case raised IndexError).
    """
    records = []
    for product_id in ids:  # not named `id`, which would shadow the builtin
        product_row = products[products['id'] == product_id]

        # A product taken from inventory may never have been ordered.
        order_statuses = order_items.loc[order_items['product_id'] == product_id, 'status'].values
        status = order_statuses[0] if len(order_statuses) else None

        # Only title-case real strings; a missing brand is passed through as is.
        brand = get_product_brand(product_id)

        records.append({
            'Name': product_row['name'].values[0],
            'Retail Price': product_row['retail_price'].values[0].round(2),
            'Brand': brand.title() if isinstance(brand, str) else brand,
            'Gender Preference': get_product_gender_preference(product_id),
            'Status': status,
        })

    result_df = pd.DataFrame(records)
    # Adjust index to start from 1
    result_df.index = result_df.index + 1
    return result_df

In [6]:
def sample_recommendation(model, dataset, user_ids, num_recommendations=5, max_preferred_brand=3, spending_range=(0.8, 1.2)):
    """Recommend products for each user id, with a fallback for unknown users.

    Users present in the fitted ``dataset`` get model-ranked products that they
    have not purchased, cancelled or returned, restricted to their gender.
    Products priced within ``spending_range`` of the user's average spending are
    picked first (at most ``max_preferred_brand`` from the preferred brand, the
    remainder from other products); any slots still open are filled with the
    next best-scoring products regardless of price. Users unknown to the
    dataset get a random sample drawn from new arrivals and best sellers.

    Args:
        model: Fitted model exposing ``predict(user_index, item_indices)``.
        dataset: Object exposing ``interactions_shape()`` and ``mapping()``,
            where ``mapping()[0]`` maps user id -> index and ``mapping()[2]``
            maps product id -> index.
        user_ids: Iterable of user ids to produce recommendations for.
        num_recommendations: Number of products to recommend per user.
        max_preferred_brand: Cap on price-matched picks from the preferred brand.
        spending_range: ``(low, high)`` multipliers applied to the user's
            average spending to form the accepted price window.

    Returns:
        ``(recommend, output)``: two lists aligned with ``user_ids``.
        ``recommend[i]`` is the list of recommended product ids. ``output[i]``
        maps product id -> model score for every candidate of a known user;
        for an unknown user it maps each recommended id to NaN, since no model
        score exists.
    """
    _, n_items = dataset.interactions_shape()

    # The mappings are loop-invariant, so fetch them once.
    mappings = dataset.mapping()
    user_index_by_id, item_index_by_id = mappings[0], mappings[2]

    # Invert product id -> index explicitly rather than relying on dict order.
    product_id_mapping = [None] * n_items
    for product_id, item_index in item_index_by_id.items():
        product_id_mapping[item_index] = product_id

    all_item_indices = np.arange(n_items)
    recommend, output = [], []

    for user_id in user_ids:
        # Membership is tested against the fitted mapping: a user the dataset
        # never saw has no index and takes the fallback path below.
        if user_id in user_index_by_id:
            scores = model.predict(user_index_by_id[user_id], all_item_indices)
            preferred_brand = get_preferred_brand(user_id)
            _, gender, avg_spending = get_user_age_and_spending(user_id)

            # get_purchased_products returns (DataFrame, ids); only the ids
            # can be put in a set.
            _, purchased_ids = get_purchased_products(user_id)
            excluded_products = set(get_returned_products(user_id)) | set(purchased_ids)

            # Map scores to product IDs, excluding returned and purchased
            # products. The exclusion compares product ids, not item indices.
            product_scores = {
                product_id: scores[item_index]
                for item_index, product_id in enumerate(product_id_mapping)
                if product_id not in excluded_products
            }
            output.append(product_scores)  # Store product scores for output

            # Candidates matching the user's gender, best score first.
            valid_products = sorted(
                (
                    (prod, score) for prod, score in product_scores.items()
                    if get_product_gender_preference(prod) == gender
                ),
                key=lambda pair: pair[1],
                reverse=True,
            )

            brand_recommendations = []
            other_recommendations = []
            price_floor = avg_spending * spending_range[0]
            price_ceiling = avg_spending * spending_range[1]

            for prod, score in valid_products:
                if len(brand_recommendations) + len(other_recommendations) == num_recommendations:
                    break
                if not (price_floor <= get_product_price(prod) <= price_ceiling):
                    continue
                if get_product_brand(prod) == preferred_brand and len(brand_recommendations) < max_preferred_brand:
                    brand_recommendations.append(prod)
                elif len(other_recommendations) < (num_recommendations - max_preferred_brand):
                    other_recommendations.append(prod)

            # If there are still slots left to fill, take the next best-scoring
            # candidates, ignoring the price window.
            fill_count = num_recommendations - (len(brand_recommendations) + len(other_recommendations))
            if fill_count > 0:
                already_chosen = set(brand_recommendations) | set(other_recommendations)
                additional_products = [prod for prod, _ in valid_products if prod not in already_chosen][:fill_count]
                other_recommendations.extend(additional_products)

            final_recommendations = brand_recommendations + other_recommendations
            print(f"Recommended items for user {user_id}: {final_recommendations}")
            recommend.append(final_recommendations)
        else:
            new_arrived = get_new_arrivals(num_recommendations//2+1)
            best_seller = get_best_sellers(num_recommendations//2+1)
            # Remove duplicates (order preserved) so the same product is never
            # recommended twice; cap the sample size to what is available.
            candidates = list(dict.fromkeys(new_arrived + best_seller))
            sample_size = min(num_recommendations, len(candidates))
            if sample_size > 0:
                new_user_recommendation = np.random.choice(candidates, size=sample_size, replace=False).tolist()
            else:
                new_user_recommendation = []
            print(f"Recommended items for user {user_id}: {new_user_recommendation}")
            recommend.append(new_user_recommendation)
            # Keep `output` index-aligned with `recommend`; there is no model
            # score for an unknown user, so NaN is stored as a placeholder.
            output.append({product_id: np.nan for product_id in new_user_recommendation})

    return recommend, output

# Recommendation

In [None]:
# Users to score; further ids the author left commented out: 37023, 15553, 7815, 60193, 3095873495
sample_user_ids = [83582]
recommend, scores = sample_recommendation(model=model, dataset=dataset, user_ids=sample_user_ids)

In [None]:
# For every user, tabulate the recommended products alongside their score.
for user_id, recommended_ids, user_scores in zip(sample_user_ids, recommend, scores):
    data_list = []
    for product_id in recommended_ids:
        prod = products[products['id']==product_id]
        data_list.append({
            'Name': prod['name'].values[0],
            'Retail Price': prod['retail_price'].values[0].round(2),
            'Brand': get_product_brand(product_id),
            'Gender Preference': get_product_gender_preference(product_id),
            # .get: a recommendation with no stored score shows NaN rather
            # than aborting the whole cell with a KeyError.
            'Product_score' : user_scores.get(product_id, np.nan)
        })
    result_df = pd.DataFrame(data_list)
    print('User ID: ',user_id)
    display(result_df)


# Evaluation

In [None]:
# Per user: print profile and purchase history (when the user exists in
# `users`), then show a table of the recommended products.
for uid, rec_ids in zip(sample_user_ids, recommend):
    print('User ID: ',uid,'Recommended Product IDs: ',rec_ids)
    if uid not in users['id'].values:
        print('New Customer: No details')
    else:
        user_detail = users[users['id']==uid]
        full_names = user_detail['first_name'].values+' '+user_detail['last_name'].values
        print('Customer Name: ',full_names[0])
        detail = get_user_age_and_spending(uid)
        print(f'Age: {detail[0]} Gender: {detail[1]}    Average Spending: {detail[2]}')
        print('Preferred Brand: ',get_preferred_brand(uid).title())
        print('Bought Products:')
        pur_prod = data[data.user_id==uid]
        # Fresh 0-based index so the table does not show positions from `data`.
        result_df = pur_prod[['name', 'product_id','sale_price','brand']].reset_index(drop=True)
        display(result_df)
    print('\nRecommended Products:')

    # One record per recommended product; the catalogue row is looked up once
    # per product and reused for both name and price.
    data_list = [
        {
            'Name': prod['name'].values[0],
            'Retail Price': prod['retail_price'].values[0].round(2),
            'Brand': get_product_brand(rec_id),
            'Gender Preference': get_product_gender_preference(rec_id)
        }
        for rec_id, prod in ((rec_id, products[products['id']==rec_id]) for rec_id in rec_ids)
    ]
    result_df = pd.DataFrame(data_list)
    display(result_df)

In [None]:
# `status_item` is removed from `data` by the cleaning cell, so reading it here
# raises KeyError on a fresh run. Count the item statuses on `order_items`.
order_items['status'].value_counts()

In [None]:
# Show which columns `data` currently holds.
data.columns

In [None]:
# Count order items per user and status. `status_item` is dropped from `data`
# by the cleaning cell, so the counts are taken from `order_items['status']`.
status_counts = order_items.groupby(['user_id', 'status']).size().unstack(fill_value=0)

# Define the statuses we are interested in
statuses_of_interest = ['Cancelled', 'Complete']  # Adjust as necessary for your data

# Statuses that never occur have no column in the unstacked table.
available_statuses = [status for status in statuses_of_interest if status in status_counts.columns]

if len(available_statuses) == len(statuses_of_interest):
    status_counts['total_required'] = status_counts[statuses_of_interest].sum(axis=1)
    # Optional: Show user_ids with the highest total of specific statuses
    max_user_id = status_counts['total_required'].idxmax()
    print(f"User ID with the highest total of 'Cancelled' and 'Complete' statuses: {max_user_id}")
    print(status_counts[['total_required'] + statuses_of_interest])
else:
    print("One or more specified statuses do not exist in the data. Please check your status names.")
    # Show only the columns that exist; selecting a missing one raises KeyError.
    print(status_counts[available_statuses])


In [None]:
# Count order items per user and status (one column per status).
status_counts = order_items.groupby(['user_id', 'status']).size().unstack(fill_value=0)

# Define the statuses we are interested in
statuses_of_interest = ['Shipped', 'Complete', 'Processing']  # Adjust as necessary for your data

# `total_required` only exists when every status of interest has a column, so
# the sort and the display that depend on it live inside the same branch.
# Previously they ran unconditionally and raised KeyError after the message.
if all(status in status_counts.columns for status in statuses_of_interest):
    status_counts['total_required'] = status_counts[statuses_of_interest].sum(axis=1)

    # Sort the DataFrame by the 'total_required' column, descending order
    sorted_status_counts = status_counts.sort_values(by='total_required', ascending=False)

    # Display the sorted DataFrame
    print(sorted_status_counts[['total_required'] + statuses_of_interest])
else:
    print("One or more specified statuses do not exist in the data. Please check your status names.")


In [None]:
# Number of order_items rows per user id.
order_items['user_id'].value_counts()