In [171]:
#!pip install statsmodels
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from pandas.plotting import autocorrelation_plot
import operator

In [172]:
sessions_df = pd.read_json("./../data/sessions.jsonl", lines=True)
sessions_df.set_index('session_id', inplace=True)

In [173]:
product_df = pd.read_json("./../data/products.jsonl", lines=True)
product_df.set_index('product_id', inplace=True)
product_df = product_df.replace([np.inf, -np.inf], np.nan)
product_df = product_df.dropna(subset=["price"])
product_df = product_df[product_df.price >= 0]

In [174]:
def get_products_bought_in_week(week_number, df, sessions_df):
    products = []
    x = df.loc[df['weeks_from_start'] == week_number].index.values
    start = x[0]
    product_list = list(sessions_df['product_id'])
    end = start + np.timedelta64(6,'D')
    sessions_df = sessions_df[(sessions_df['event_type'] == "BUY_PRODUCT")]
    for i, date in enumerate(sessions_df['timestamp']):
        date = np.datetime64(date)
        if start <= date <= end:
            products.append(product_list[i])
    return products

In [175]:
def create_weeks(df):
    dates = df['timestamp']
    dates = pd.to_datetime(dates)
    dates = dates.dt.to_period('W')
    dates = dates.drop_duplicates()
    dates = dates.sort_values()
    return dates

In [176]:
def split_data(y, split_ratio = 0.8):
    split_n = int(split_ratio*len(y))
    train = y.iloc[:split_n]
    test = y.iloc[split_n:]
    return train, test

In [177]:
def calculate_week_number(date = datetime.date.today()):
    first_date = create_weeks(sessions_df).values[0].start_time.date()
    return (date - first_date).days//7

In [178]:
def product_bought(product_id, df, dates, df_price = product_df):
    product_amount = df[(df['event_type'] == "BUY_PRODUCT") & (df['product_id'] == product_id)]
    
    daterange_df = pd.DataFrame()
    time_list = [(dates.values[i].start_time, dates.values[i].end_time) for i in range(len(dates))]
    daterange_df['count'] = [product_amount['timestamp'].between(s, e).sum() for s, e in time_list]
    daterange_df['price'] = list(product_df[product_df.index == 1114]['price'])[0]
    daterange_df['discount'] = 0
    keys = product_amount.index
    for i, time in enumerate(product_amount['timestamp']):
        product_amount['timestamp'][keys[i]] = calculate_week_number(time.date())
    keys_t = list(product_amount['timestamp'])
    j = 0
    for i in daterange_df.index:
        if i in keys_t:
            daterange_df['discount'][i] = product_amount['offered_discount'][keys[j]]
            j += 1
    return daterange_df

In [179]:
def prepare_dataframe(df):
    dates = create_weeks(df)
    weeks = dates.values
    df3 = pd.DataFrame()
    df3['date'] = [weeks[i].start_time for i in range(len(weeks))]
    df2 = pd.DataFrame(index = df3['date'])
    df2['weeks_from_start'] = ((df2.index - df2.index[0]).days)//7
    return df2, dates

In [180]:
def prepare_data(product_id, df, source_df, dates):
    x = df['weeks_from_start'].values.reshape(-1, 1)
    y = product_bought(product_id, source_df, dates)['count'].values
    return x, y

In [181]:
def fit_model(dataset):
    model=ARIMA(dataset, order=(1,1,1))
    model_fit=model.fit()
    return model_fit

In [182]:
def get_model_summary(model):
    return model.summary()

In [183]:
def add_empty_rows(df, amount):
    for i in range(amount+1):
        df.loc[len(df)] = 0
    return df

In [184]:
def plot_autocorrelation(dataset):
    autocorrelation_plot(dataset)
    plt.show()

In [185]:
def predict(start_week, end_week, df, model):
    while end_week > len(df):
        df = add_empty_rows(df, end_week - len(df))
    df['forecast']=model.predict(start=start_week,end=end_week,dynamic=True)
    return df

In [186]:
def predict_products(sessions_df, week_number):
    products_predictions = {}
    product_list = list(sessions_df['product_id'])
    product_list = list(dict.fromkeys(product_list))
    df, dates = prepare_dataframe(sessions_df)
    for product in product_list:
        y = product_bought(product, sessions_df, dates)
        train, test = split_data(y)
        model_fit = fit_model(train['count'])
        predicted = predict(week_number-1, week_number+1, train, model_fit)
        products_predictions[product] = predicted['forecast'][week_number]
    return products_predictions

In [187]:
def calculate_success_criterion(week_number, item_amount = 150):
    predictions = predict_products(sessions_df, week_number)
    sorted_d = dict(sorted(predictions.items(), key=operator.itemgetter(1),reverse=True))
    best_products = dict(list(sorted_d.items())[:item_amount])
    chosen_products = list(best_products.keys())
    df, dates = prepare_dataframe(sessions_df)
    bought_products = get_products_bought_in_week(week_number, df, sessions_df)
    same = (set(chosen_products) & set(bought_products))
    diff = set(bought_products) - set(chosen_products)
    all = len(diff) + len(chosen_products)
    return len(same)/all

In [188]:
def product_viewed(product_id, df, dates, df_price = product_df):
    viewed = df[(df['event_type'] == "VIEW_PRODUCT") & (df['product_id'] == product_id)]
    product_amount = df[(df['event_type'] == "BUY_PRODUCT") & (df['product_id'] == product_id)]
    daterange_df = pd.DataFrame()
    time_list = [(dates.values[i].start_time, dates.values[i].end_time) for i in range(len(dates))]
    daterange_df['count'] = [product_amount['timestamp'].between(s, e).sum() for s, e in time_list]
    daterange_df['viewed'] = [viewed['timestamp'].between(s, e).sum() for s, e in time_list]
    daterange_df['price'] = list(product_df[product_df.index == 1114]['price'])[0]
    daterange_df['discount'] = 0
    keys = product_amount.index
    for i, time in enumerate(product_amount['timestamp']):
        product_amount['timestamp'][keys[i]] = calculate_week_number(time.date())
    keys = viewed.index
    for i, time in enumerate(viewed['timestamp']):
        viewed['timestamp'][keys[i]] = calculate_week_number(time.date())
    keys_t = list(viewed['timestamp'])
    j = 0
    for i in daterange_df.index:
        if i in keys_t:
            daterange_df['discount'][i] = viewed['offered_discount'][keys[j]]
            j += 1
    return daterange_df

In [189]:
def predict_products_multivar(sessions_df, week_number, y = None):
    products_predictions = {}
    product_list = list(sessions_df['product_id'])
    product_list = list(dict.fromkeys(product_list))
    df, dates = prepare_dataframe(sessions_df)
    # for product in product_list:
    product = 1114
    if y is None:
        y = product_viewed(product, sessions_df, dates)
    train, test = split_data(y)
    # train = z.iloc[:-30]
    model_fit = fit_model(train['count'])
    predicted = predict(week_number-1, week_number+1, train, model_fit)
    products_predictions[product] = predicted['forecast'][week_number]
    return products_predictions, test

In [190]:
def multi_var_arima(sessions_df, product, week_number):
    df, dates = prepare_dataframe(sessions_df)
    z = product_viewed(product, sessions_df, dates)
    z['wsp_sprzedazy'] = z['count']*z['price']*z['discount']
    z['wsp_wyswietlen'] = z['viewed']*z['price']*z['discount']
    pred, test = predict_products_multivar(sessions_df, week_number, z)
    return pred, test

In [191]:
def pred_all_products(sessions_df, week_number):
    products_predictions = {}
    product_list = list(sessions_df['product_id'])
    product_list = list(dict.fromkeys(product_list))
    for product in product_list:
        pred, test = multi_var_arima(sessions_df, product, week_number)
        products_predictions[product] = pred
    return products_predictions

In [192]:
def pred_plot(pred, test):
    pred.plot(legend=True)
    test['count'].plot(legend=True)

In [193]:
pred, test = multi_var_arima(sessions_df, 1114, 100)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_amount['timestamp'][keys[i]] = calculate_week_number(time.date())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_values(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_amount['timestamp'][keys[i]] = calculate_week_number(time.date())
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

In [194]:
# Predykcja dla wszystkich produktów za pomocą wielu atrybutów
# pred_all_products(sessions_df, 100)

In [195]:
# Obliczenie kryterium sukcesu dla modelu ARIMA z jednym atrybutem wejściowym - 
# liczba produktów kupionych w danym tygodniu
# calculate_success_criterion(100)