In [3]:
import pandas as pd
import matplotlib.pyplot as plt
from client_analiser.models import ModelInterface, ModelA, ModelB

In [4]:
# Locations of the raw JSONL inputs for the selected data iteration.
iteration_path = "iteration_3/"
deliveries_path = f"../data/{iteration_path}raw/deliveries.jsonl"
products_path = f"../data/{iteration_path}raw/products.jsonl"
sessions_path = f"../data/{iteration_path}raw/sessions.jsonl"
users_path = f"../data/{iteration_path}raw/users.jsonl"

In [5]:
# Load the four line-delimited JSON inputs, in the same order as before:
# deliveries, products, sessions, users.
deliveries_data, products_data, sessions_data, users_data = (
    pd.read_json(source_path, lines=True)
    for source_path in (deliveries_path, products_path, sessions_path, users_path)
)

In [6]:
# Order the events chronologically, then derive week / month / quarter columns
# from each event's timestamp.
sessions_data = sessions_data.sort_values(by=['timestamp']).assign(
    timestamp_week=lambda frame: frame['timestamp'].apply(lambda ts: ts.week),
    timestamp_month=lambda frame: frame['timestamp'].apply(lambda ts: ts.month),
    timestamp_quarter=lambda frame: frame['timestamp'].apply(lambda ts: ts.quarter),
)

In [7]:
# Split by calendar month: months 1-11 are the training period, month 12 is held out.
# Explicit copies are taken because downstream model code assigns new columns on
# these frames; assigning on a filtered slice triggers pandas' SettingWithCopyWarning.
# NOTE(review): the split looks at the month number only, so the year is ignored —
# confirm the data covers a single calendar year.
train_data = sessions_data[sessions_data.timestamp_month < 12].copy()
test_data = sessions_data[sessions_data.timestamp_month >= 12].copy()

In [8]:
def get_user_id_from_session(session):
    """Return the single user id shared by every row of ``session``.

    Args:
        session: DataFrame with a ``user_id`` column and at least one row.

    Returns:
        The ``user_id`` value of the first row.

    Raises:
        Exception: if any row carries a ``user_id`` different from the first row's.
    """
    user_ids = session['user_id']
    first_user_id = user_ids.iloc[0]
    # Every row must agree with the first one; a mismatch means the frame mixes users.
    if any(first_user_id != current_id for current_id in user_ids):
        raise Exception("How it is even possible")
    return first_user_id

def get_user_expenses(user_session_data):
    """Summarise one user's purchases as a ``{'user_id', 'expenses'}`` record.

    Args:
        user_session_data: DataFrame of a single user's events with ``user_id``,
            ``event_type`` and ``price`` columns.

    Returns:
        dict with the user's id and the sum of ``price`` over rows whose
        ``event_type`` is ``"BUY_PRODUCT"``.
    """
    # Resolve (and validate) the user id first, before touching the purchase rows.
    owner_id = get_user_id_from_session(user_session_data)
    is_purchase = user_session_data['event_type'] == "BUY_PRODUCT"
    total_spent = user_session_data.loc[is_purchase, 'price'].sum()
    return {'user_id': owner_id, 'expenses': total_spent}


def calculate_expenses(sessions_data, products_data, users_data):
    """Compute the total value of purchased products for each user.

    Only rows whose ``event_type`` is ``"BUY_PRODUCT"`` count; each is priced by
    joining on ``product_id`` and summing the product's ``price``.

    Args:
        sessions_data: DataFrame of events with ``user_id``, ``product_id`` and
            ``event_type`` columns.
        products_data: DataFrame with ``product_id`` and ``price`` columns.
        users_data: DataFrame with a ``user_id`` column of integer ids.

    Returns:
        DataFrame indexed by ``user_id`` with one ``expenses`` column. The index
        covers every integer from the smallest to the largest id in
        ``users_data`` (inclusive); ids without purchases get 0. An empty
        ``users_data`` yields an empty frame.
    """
    user_ids = users_data['user_id']
    if user_ids.empty:
        # No users: there is no id range to report on.
        return pd.DataFrame(
            columns=['expenses'],
            index=pd.Index([], dtype='int64', name='user_id'),
            dtype='float64',
        )

    # Keep purchase events only before joining; other events never contribute.
    purchases = sessions_data[sessions_data['event_type'] == "BUY_PRODUCT"]
    priced_purchases = pd.merge(purchases, products_data, on="product_id")
    # NOTE(review): the listed price is summed as-is; no discount is applied here —
    # confirm that is the intended definition of "expenses".
    spent_per_user = priced_purchases.groupby('user_id')['price'].sum()

    # Report on the whole contiguous id range, not just users who bought something.
    all_user_ids = pd.Index(
        list(range(int(user_ids.min()), int(user_ids.max()) + 1)),
        dtype='int64',
        name='user_id',
    )
    return spent_per_user.reindex(all_user_ids, fill_value=0).to_frame(name='expenses')


def loss(predictions, observations):
    """Build a per-user table of prediction errors.

    Args:
        predictions: DataFrame keyed by ``user_id`` with a ``user_expenses`` column.
        observations: DataFrame keyed by ``user_id`` with an ``expenses`` column.

    Returns:
        The two inputs joined on ``user_id`` and sorted by it, with two extra
        columns: ``difference`` (``user_expenses - expenses``) and
        ``difference_square`` (its square). Note that this is a table, not a
        single aggregated loss value.
    """
    unified_data = pd.merge(predictions, observations, on="user_id").sort_values(by=['user_id'])
    unified_data['difference'] = unified_data['user_expenses'] - unified_data['expenses']
    # Vectorised square of the whole column rather than a per-element Python call.
    unified_data['difference_square'] = unified_data['difference'] ** 2
    return unified_data

In [10]:
# Observed per-user spend over the held-out sessions (test_data); these are the
# values that model predictions are compared against in loss().
observations = calculate_expenses(test_data, products_data, users_data)
observations

Unnamed: 0_level_0,expenses
user_id,Unnamed: 1_level_1
102,0.00
103,554.77
104,2332.01
105,0.00
106,0.00
...,...
297,109.00
298,2399.00
299,0.00
300,0.00


In [11]:
# Models
# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# '../../models/model_b_v1' (presumably raised by ModelB()), while the save cell
# writes the weights to '../models/model_b_v1' — confirm which path the model loads from.
model_A: ModelInterface = ModelA()
model_B: ModelInterface = ModelB()

FileNotFoundError: [Errno 2] No such file or directory: '../../models/model_b_v1'

In [18]:
# NOTE(review): training is given the full sessions_data (not train_data) together
# with observations computed from test_data — confirm the held-out period does not
# leak into the model inputs.
# NOTE(review): the logged loss stays around 3.2e+06 across all epochs shown —
# check whether training is actually converging.
model_B.train_and_extract(sessions_data, users_data, products_data, observations)

Epoch 0 loss 3.26e+06
Epoch 1 loss 3.2e+06
Epoch 2 loss 3.25e+06
Epoch 3 loss 3.28e+06
Epoch 4 loss 3.21e+06
Epoch 5 loss 3.25e+06
Epoch 6 loss 3.21e+06
Epoch 7 loss 3.24e+06
Epoch 8 loss 3.28e+06
Epoch 9 loss 3.23e+06
Epoch 10 loss 3.23e+06
Epoch 11 loss 3.25e+06
Epoch 12 loss 3.22e+06
Epoch 13 loss 3.21e+06
Epoch 14 loss 3.22e+06
Epoch 15 loss 3.23e+06
Epoch 16 loss 3.25e+06
Epoch 17 loss 3.24e+06
Epoch 18 loss 3.22e+06
Epoch 19 loss 3.28e+06
Epoch 20 loss 3.24e+06
Epoch 21 loss 3.21e+06
Epoch 22 loss 3.21e+06
Epoch 23 loss 3.27e+06
Epoch 24 loss 3.23e+06
Epoch 25 loss 3.26e+06
Epoch 26 loss 3.22e+06
Epoch 27 loss 3.25e+06
Epoch 28 loss 3.21e+06
Epoch 29 loss 3.19e+06
Epoch 30 loss 3.23e+06
Epoch 31 loss 3.21e+06
Epoch 32 loss 3.22e+06
Epoch 33 loss 3.26e+06
Epoch 34 loss 3.23e+06
Epoch 35 loss 3.24e+06
Epoch 36 loss 3.23e+06
Epoch 37 loss 3.24e+06
Epoch 38 loss 3.21e+06
Epoch 39 loss 3.21e+06
Epoch 40 loss 3.2e+06
Epoch 41 loss 3.25e+06
Epoch 42 loss 3.22e+06
Epoch 43 loss 3.29e+06


In [21]:
import torch

In [26]:
# Write the state_dict of model B's network to disk.
torch.save(model_B.net.state_dict(), "../models/model_b_v1")

In [19]:
# Model B predictions, computed from the training-period sessions (train_data).
predictionsB = model_B.predict_expenses(products_data, deliveries_data, train_data, users_data)
predictionsB

     user_id  expenses  products_bought  events_number  sessions_number  \
0        139  23400.85               49            255               66   
1        242  44518.18               58            327               86   
2        108  16312.03               23            129               32   
3        143   7214.08               15             84               26   
4        140  32333.55               43            306               83   
..       ...       ...              ...            ...              ...   
195      289      0.00                0              7                1   
196      225    109.00                1              2                1   
197      152      0.00                0              5                2   
198      162     78.96                2              6                2   
199      229    245.00                1              5                1   

     average_discount  average_discount_on_bought      city  
0           11.019608                

{'139': 2.4053874,
 '242': 6.761814,
 '108': 0.9596191,
 '143': 1.2847893,
 '140': 4.243804,
 '202': 2.0199518,
 '270': 4.1948166,
 '212': 2.2083397,
 '189': 0.53052944,
 '205': 2.0364568,
 '255': 2.6278179,
 '215': 0.7134879,
 '273': 0.38127244,
 '281': 2.606027,
 '227': 3.5759642,
 '222': 0.8113728,
 '235': 1.6218488,
 '171': 2.511823,
 '188': 0.50300217,
 '135': 2.487667,
 '252': 2.30639,
 '238': 1.8506325,
 '145': 1.1416111,
 '286': 1.7612832,
 '157': 1.1589057,
 '265': 1.3290148,
 '127': 3.4027283,
 '159': 1.8828454,
 '119': 2.4876783,
 '274': 0.7746666,
 '160': 0.6531872,
 '105': 0.2769903,
 '261': 0.29673648,
 '185': 1.6448128,
 '183': 1.4641608,
 '192': 1.9714302,
 '186': 0.93932706,
 '102': 1.5567591,
 '124': 6.99047,
 '282': 0.891833,
 '210': 0.86308485,
 '125': 3.2755234,
 '272': 3.7039,
 '104': 2.7983406,
 '115': 2.4648428,
 '136': 2.1010253,
 '285': 2.174125,
 '133': 3.4574542,
 '187': 1.8504581,
 '175': 0.54898196,
 '161': 3.367254,
 '167': 2.2820067,
 '213': 3.456309,
 '

In [20]:
# Model A predictions, computed from the training-period sessions (train_data).
# NOTE(review): the recorded output shows SettingWithCopyWarning raised on lines that
# assign columns to the sessions frame — presumably the train_data passed in here.
predictionsA = model_A.predict_expenses(products_data, deliveries_data, train_data, users_data)
predictionsA

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions["timestamp"] = pd.to_datetime(sessions["timestamp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions['timestamp_interval'] = sessions['timestamp'].apply(lambda x: x.month)


{139: 180.71,
 242: 533.24,
 108: 1718.53,
 143: 830.67,
 140: 3541.68,
 202: 4220.94,
 270: 5580.33,
 212: 3510.65,
 189: 1847.17,
 205: 0.0,
 255: 2161.97,
 215: 1215.29,
 273: 2287.13,
 281: 0.0,
 227: 2059.77,
 222: 2313.95,
 235: 1114.4,
 171: 2303.78,
 188: 1264.86,
 135: 29.44,
 252: 0.0,
 238: 3749.14,
 145: 1218.36,
 286: 646.18,
 157: 773.74,
 265: 985.88,
 127: 2063.19,
 159: 321.27,
 119: 6.69,
 274: 2175.1,
 160: 1187.04,
 105: 1035.18,
 261: 1422.7,
 185: 2356.24,
 183: 728.38,
 192: 293.5,
 186: 1779.22,
 102: 161.0,
 124: 7255.94,
 282: 2314.94,
 210: 2620.66,
 125: 6221.44,
 272: 7325.82,
 104: 4476.97,
 115: 143.99,
 136: 2152.11,
 285: 509.28,
 133: 5321.47,
 187: 768.43,
 175: 656.75,
 161: 5879.92,
 167: 541.17,
 213: 4658.1,
 292: 4557.16,
 256: 28.46,
 278: 64.0,
 263: 0.0,
 148: 855.29,
 200: 1640.37,
 280: 5114.39,
 287: 5961.86,
 208: 932.17,
 149: 1544.01,
 233: 2530.85,
 236: 453.32,
 246: 3274.73,
 193: 514.6,
 147: 25.45,
 158: 2853.03,
 251: 1639.52,
 277

In [25]:
# Per-user prediction error of model A against the observed expenses.
# NOTE(review): loss() reads a 'user_expenses' column, but the recorded output of the
# predictionsA cell is a plain dict — confirm predictionsA is a DataFrame when this runs.
loss(predictionsA, observations)

Unnamed: 0_level_0,user_expenses,expenses,difference,difference_square
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
102,161.00,0.00,161.00,2.592100e+04
103,4688.73,554.77,4133.96,1.708963e+07
104,4476.97,2332.01,2144.96,4.600853e+06
105,1035.18,0.00,1035.18,1.071598e+06
106,3728.79,0.00,3728.79,1.390387e+07
...,...,...,...,...
297,2054.55,109.00,1945.55,3.785165e+06
298,3417.61,2399.00,1018.61,1.037566e+06
299,3040.79,0.00,3040.79,9.246404e+06
300,3630.74,0.00,3630.74,1.318227e+07
