In [1]:
import keras
import pandas as pd
import pickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import recmetrics as rec
import tqdm as tqdm
import unidecode
import joblib
import random
import tensorflow as tf

import preprocessing.sale_preprocessing as sales_prep

Num GPUs Available:  0


In [3]:
# Sale-side feature columns, listed once and reused below so the sale table
# and the model input cannot drift apart.
_SALE_FEATURES = [
    'log_followers', 'conversion', 'log_revenue',
    'log_brand_appearance', 'log_avg_price',
    'artisanal', 'b_corporation', 'bio', 'biodegradable',
    'cadeau_ideal', 'concept_original', 'durable',
    'eco_friendly', 'excellent_sur_yuka', 'exclusivite_choose',
    'fabrication_a_la_demande', 'fait_main', 'gluten_free',
    'iconique', 'inclusive', 'innovation', 'made_in_europe',
    'made_in_france', 'madeinjapan', 'naturel', 'oeko_tex',
    'premium', 'recyclable', 'saint_valentin', 'savoir_faire',
    'seconde_main', 'socialement_engagee', 'serie_limitee',
    'tendance', 'upcycling', 'vegan', 'vintage', 'zerodechet',
    'category_sale',
]

# User-side feature columns (log_delta is kept apart because its position
# differs between the lists below).
_USER_FEATURES = [
    'log_monetary', 'log_frequency', 'log_recency',
    'category_1', 'category_2', 'category_3',
]

# Columns handed to model.predict, in this exact order:
# log_delta, then the sale features, then the user features.
COLUMNNS_FOR_ML = ['log_delta', *_SALE_FEATURES, *_USER_FEATURES]

# Columns describing a user at a given sale start date.
COLS_FOR_USERS = ['user_key', 'start_at', *_USER_FEATURES, 'log_delta']

# Columns describing a sale.
COLS_FOR_SALES = ['sale_id', 'start_at', *_SALE_FEATURES]

In [4]:
def _read_prepared_csv(csv_path):
    """Read a prepared CSV (first column as index), drop rows with any missing value and renumber the rows."""
    prepared = pd.read_csv(csv_path, index_col=0)
    return prepared.dropna().reset_index(drop=True)


scored_train_data = _read_prepared_csv('training_preparation/preped_train_data.csv')
scored_test_data = _read_prepared_csv('training_preparation/preped_test_data.csv')

# Table with `min_start_date`, parsed to datetimes so it can be compared
# against the `start_at` dates of the sales.
first_orders = pd.read_json('training_preparation/first_order_date.json')
first_orders = first_orders.assign(
    min_start_date=pd.to_datetime(first_orders['min_start_date'])
)

In [5]:
# Sample user id; the functions in the cells shown here receive the user id as an argument.
user_id = '00FGOyepkGazURwZ3nrvjTXsClS2'

# Raw sales export; rows with any missing value are discarded.
raw_sales = (
    pd.read_json('training_preparation/sales_mai22_mai23.json')
    .dropna()
    .reset_index(drop=True)
)

# Attach each sale's start date to the test rows, then parse it as a datetime.
sale_start_dates = raw_sales[['start_at', 'sale_id']]
dated_test_data = scored_test_data.merge(sale_start_dates, on='sale_id', how='left')
dated_test_data = dated_test_data.assign(
    start_at=pd.to_datetime(dated_test_data['start_at'])
)

# De-duplicated user-side and sale-side views of the dated test data.
user_information = dated_test_data.loc[:, COLS_FOR_USERS].drop_duplicates()
all_sales = dated_test_data.loc[:, COLS_FOR_SALES].drop_duplicates()


In [6]:
# Load the trained scoring model from its saved `.h5` file.
MODEL_PATH = 'TF_model.h5'
model = tf.keras.models.load_model(MODEL_PATH)

In [7]:
def creating_entry_data(user_id, available_sales: pd.DataFrame, user_information: pd.DataFrame, df_first_order:pd.DataFrame):
    """
    Build the model input rows for one user by attaching the user's features
    to every available sale that shares the same `start_at`.

    :param user_id: identifier of the user (a value of the `user_key` column)
    :param available_sales: sales to score; must contain a `start_at` column
    :param user_information: frame containing at least the COLS_FOR_USERS columns
    :param df_first_order: frame with `user_key` and `min_start_date` columns
    :return: `available_sales` left-merged with the user's rows on `start_at`
    :raises IndexError: if `user_id` has no row in `df_first_order`
    """
    # Date used to detect cold-start rows: rows whose start_at equals the
    # user's min_start_date.
    is_this_user = df_first_order['user_key'] == user_id
    cold_start_date = df_first_order.loc[is_this_user, 'min_start_date'].iloc[0]

    user_info = user_information.loc[
        user_information['user_key'] == user_id, COLS_FOR_USERS
    ].drop_duplicates()

    # Cold start: reset the history features on the first-order date.
    # The features live under their `log_` names (see COLS_FOR_USERS); the
    # un-prefixed names used previously do not exist in `user_info`, so the
    # reset never reached the columns the model actually reads.
    cold_start_rows = user_info['start_at'] == cold_start_date
    history_features = ['log_monetary', 'log_recency', 'log_frequency',
                        'category_1', 'category_2', 'category_3']
    user_info.loc[cold_start_rows, history_features] = 0

    # `user_info` also carries log_delta, so the merge adds it along with the
    # other user features.
    return available_sales.merge(user_info, on='start_at', how='left')


In [8]:
def ranking(df_for_prediction: pd.DataFrame, model):
    """
    Score every row of `df_for_prediction` with `model` and return the sales
    ordered from the highest to the lowest score.

    :param df_for_prediction: frame holding `sale_id` and every COLUMNNS_FOR_ML column
    :param model: object whose `predict` is called on the feature values
    :return: DataFrame with `sale_id` and `score` columns, sorted by descending score
    """
    feature_values = df_for_prediction[COLUMNNS_FOR_ML].values
    raw_scores = model.predict(feature_values)

    # The prediction is flattened to 1-D so it can be stored as one column
    # next to the sale ids.
    scored_sales = pd.DataFrame(df_for_prediction['sale_id'])
    scored_sales['score'] = list(raw_scores.flatten())

    # Best-scored sales first.
    return scored_sales.sort_values('score', ascending=False)

In [9]:
def make_preditions(user_id, available_sales, model, scored_data, df_first_order=None):
    """
    Predict the ranking of the given sales for a specific user using a given model.

    :param user_id: a string containing the user id to predict for
    :param available_sales: pd.DataFrame containing the information of the sales
    to rank
    :param model: model used for the scoring
    :param scored_data: pd.DataFrame holding the user information; passed to
    `creating_entry_data` as its `user_information` argument
    :param df_first_order: optional pd.DataFrame with `user_key` and
    `min_start_date` columns; when omitted, the notebook-level `first_orders`
    frame is used (the previous, implicit behaviour)
    :return: pd.DataFrame of `sale_id` and `score`, best score first, with a
    fresh 0..n-1 index
    """
    # Fall back to the notebook global so existing four-argument calls keep working.
    if df_first_order is None:
        df_first_order = first_orders

    model_input = creating_entry_data(user_id, available_sales, scored_data, df_first_order)
    return ranking(model_input, model).reset_index(drop=True)

In [10]:
def predicted_and_truth_for_user(user_id, available_sales, model, scored_data):
    """
    Return the model's ranking of sales for one user.

    Only the sales whose `start_at` matches a date on which the user had an
    interaction are ranked. Despite the function's name, only the predicted
    ranking is returned, not the ground truth.

    :param user_id: a string containing the user id to predict for
    :param available_sales: pd.DataFrame of candidate sales with a `start_at` column
    :param model: model used for the scoring
    :param scored_data: pd.DataFrame with `user_key`, `start_at` and
    `interaction` columns; also forwarded to `make_preditions`
    :return: pd.Series of `sale_id` in the order produced by `make_preditions`
    """
    # Start dates on which this user had an interaction (interaction == 1).
    user_interacted = (scored_data['user_key'] == user_id) & (scored_data['interaction'] == 1)
    interaction_dates = scored_data.loc[user_interacted, 'start_at'].tolist()

    # Keep only the sales that start on one of those dates.
    starts_on_interaction_date = available_sales['start_at'].isin(interaction_dates)
    candidate_sales = pd.DataFrame(available_sales[starts_on_interaction_date])

    ranked_sales = make_preditions(user_id, candidate_sales, model, scored_data)
    return ranked_sales['sale_id']

In [11]:
def precision(predicted, actual):
    """
    Fraction of the `predicted` items that also appear in `actual`.

    :param predicted: sequence of predicted items (repeated items each count)
    :param actual: container of relevant items, tested with `in`
    :return: float between 0.0 and 1.0; 0.0 when `predicted` is empty
    """
    # An empty prediction list used to raise ZeroDivisionError.
    if len(predicted) == 0:
        return 0.0

    relevant_count = sum(1 for value in predicted if value in actual)
    return relevant_count / len(predicted)

def apk(actual: list, predicted: list, k=10) -> float:
    """
    Average precision at `k` for a single ranked list.

    Walks the first `k` predictions; each time a relevant item is met for the
    first time, the precision at that rank (distinct hits so far / rank) is
    added. The sum is divided by the number of hits.

    A repeated prediction is ignored: it is neither a new hit nor counted in
    the precision numerator. Previously a repeated relevant item was skipped
    as a hit but still inflated later precision terms. For predictions
    without repeats the result is unchanged.

    :param actual: relevant items
    :param predicted: ranked predictions, best first
    :param k: number of top predictions taken into account
    :return: average precision in [0, 1]; 0.0 if either list is empty or no
    relevant item is found in the top `k`
    """
    if not predicted or not actual:
        return 0.0

    top_k = predicted[:k]

    score = 0.0
    hits = 0
    seen = []  # a list rather than a set so unhashable items keep working
    for rank, item in enumerate(top_k, start=1):
        if item in seen:
            continue
        seen.append(item)
        if item in actual:
            hits += 1
            score += hits / rank

    if hits == 0:
        return 0.0

    return score / hits

In [12]:
def performance(scored_data, all_sales, model, k):
    """
    Mean of `apk` at `k` over every user having at least one interaction.

    :param scored_data: pd.DataFrame with `user_key`, `sale_id` and
    `interaction` columns; also forwarded to `predicted_and_truth_for_user`
    :param all_sales: pd.DataFrame of candidate sales, forwarded to
    `predicted_and_truth_for_user`
    :param model: model used for the scoring
    :param k: cut-off passed to `apk`
    :return: tuple `(perf, truth, pred)`: the mean score, the list of
    interacted sale ids per user, and the list of ranked sale ids per user
    (both lists follow the same user order)
    """
    # Collect each user's interacted sales in one pass instead of re-filtering
    # the whole frame once per user. sort=False keeps users in order of first
    # appearance, the order `unique()` produced before.
    positives = scored_data.loc[scored_data['interaction'] == 1, ['user_key', 'sale_id']]
    truth_by_user = positives.groupby('user_key', sort=False)['sale_id'].agg(list)

    available_users = truth_by_user.index
    truth = truth_by_user.tolist()

    pred = [
        predicted_and_truth_for_user(user, all_sales, model, scored_data).tolist()
        for user in tqdm.tqdm(available_users)
    ]

    perf = np.mean([apk(a, p, k) for a, p in zip(truth, pred)])
    return perf, truth, pred

In [9]:
# Report how many GPU devices TensorFlow lists.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

Num GPUs Available:  0


In [None]:
# Evaluate the model on the dated test data.
# The bare `tf.device` expression that stood here did nothing (it only
# referenced the function) and has been removed.
EVAL_K = 10
mean_apk, truth_per_user, pred_per_user = performance(dated_test_data, all_sales, model, EVAL_K)

# Display only the score; the per-user lists stay available in the variables
# above instead of being dumped into the cell output.
mean_apk