# Do zrobienia
- wyekstrahować jakoś sesje

### Polecenie od klienta:
*"Mamy co prawda dodatkowe benefity dla naszych najlepszych klientów, ale może dałoby
się ustalić, kto potencjalnie jest skłonny wydawać u nas więcej?"*

### Zadanie biznesowe
Sugerowanie klientów, którzy mogą wrócić do serwisu.

### Zadanie modelowania
Model klasyfikacji binarnej (np. regresja logistyczna), szacujący prawdopodobieństwo powrotu klienta do serwisu.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Locations of the raw JSONL inputs for the data iteration under analysis.
iteration_path = "iteration_3/"
deliveries_path = f"../data/{iteration_path}raw/deliveries.jsonl"
products_path = f"../data/{iteration_path}raw/products.jsonl"
sessions_path = f"../data/{iteration_path}raw/sessions.jsonl"
users_path = f"../data/{iteration_path}raw/users.jsonl"

In [62]:
# Load the four line-delimited JSON files, one DataFrame per file.
deliveries_data, products_data, sessions_data, users_data = (
    pd.read_json(raw_path, lines=True)
    for raw_path in (deliveries_path, products_path, sessions_path, users_path)
)

In [63]:
# Preview the session event log that was just loaded.
sessions_data

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id
0,124,2021-05-19 06:57:15,102,1277,VIEW_PRODUCT,5,
1,124,2021-05-19 06:59:15,102,1276,VIEW_PRODUCT,5,
2,124,2021-05-19 07:02:36,102,1276,BUY_PRODUCT,5,20001.0
3,125,2021-04-23 13:46:03,102,1284,VIEW_PRODUCT,20,
4,125,2021-04-23 13:50:10,102,1292,VIEW_PRODUCT,20,
...,...,...,...,...,...,...,...
24569,6881,2021-11-03 05:28:10,301,1073,VIEW_PRODUCT,5,
24570,6881,2021-11-03 05:33:08,301,1201,VIEW_PRODUCT,5,
24571,6881,2021-11-03 05:36:58,301,1072,VIEW_PRODUCT,5,
24572,6881,2021-11-03 05:37:32,301,1222,VIEW_PRODUCT,5,


In [61]:
# Order events chronologically and derive calendar features from `timestamp`.
# This cell needs `sessions_data` to be loaded first (run the read_json cell before it).
# The vectorised `.dt` accessors replace the row-by-row `.apply(lambda ...)` calls;
# the ISO week is cast to int64 so the column keeps a plain integer dtype.
sessions_data = sessions_data.sort_values(by=['timestamp']).assign(
    timestamp_week=lambda events: events['timestamp'].dt.isocalendar().week.astype('int64'),
    timestamp_quarter=lambda events: events['timestamp'].dt.quarter,
    timestamp_date=lambda events: events['timestamp'].dt.date,
)

Podział na zbiór testowy i treningowy

In [6]:
# Time-based hold-out: quarters 1-3 form the training set, quarter 4 the test set.
# NOTE(review): only the quarter number is compared, so this assumes the data
# covers a single calendar year — confirm.
train_data = sessions_data.loc[sessions_data['timestamp_quarter'].lt(4)]
test_data = sessions_data.loc[sessions_data['timestamp_quarter'].eq(4)]

Przykład join-a

In [7]:
# Join example: enrich every training event with the matching product attributes.
train_data.merge(products_data, on="product_id").sort_values(by=['timestamp'])

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,timestamp_week,timestamp_quarter,timestamp_date,product_name,category_path,price
0,1615,2021-01-08 17:29:37,139,1032,VIEW_PRODUCT,10,,1,1,2021-01-08,LCD Iiyama E2280WSD,Komputery;Monitory;Monitory LCD,688.78
197,1615,2021-01-08 17:32:51,139,1033,VIEW_PRODUCT,10,,1,1,2021-01-08,LCD Iiyama T1932MSC,Komputery;Monitory;Monitory LCD,3029.00
198,1615,2021-01-08 17:35:32,139,1033,BUY_PRODUCT,10,20833.0,1,1,2021-01-08,LCD Iiyama T1932MSC,Komputery;Monitory;Monitory LCD,3029.00
383,1635,2021-01-10 05:56:20,139,1292,VIEW_PRODUCT,20,,1,1,2021-01-10,Philips SDV8622,Sprzęt RTV;Video;Telewizory i akcesoria;Anteny...,189.00
574,1635,2021-01-10 05:57:54,139,1291,VIEW_PRODUCT,20,,1,1,2021-01-10,Philips SDV6224,Sprzęt RTV;Video;Telewizory i akcesoria;Anteny...,168.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8942,336,2021-09-30 23:03:20,106,1030,VIEW_PRODUCT,0,,39,3,2021-09-30,LCD Iiyama B2280WSD,Komputery;Monitory;Monitory LCD,739.00
8943,336,2021-09-30 23:05:46,106,1030,BUY_PRODUCT,0,20116.0,39,3,2021-09-30,LCD Iiyama B2280WSD,Komputery;Monitory;Monitory LCD,739.00
11546,6769,2021-09-30 23:42:01,298,1276,VIEW_PRODUCT,0,,39,3,2021-09-30,Apple iPad mini 64GB,Komputery;Tablety i akcesoria;Tablety,1816.97
13730,6769,2021-09-30 23:45:23,298,1277,VIEW_PRODUCT,0,,39,3,2021-09-30,Apple iPad mini 64GB 4G,Komputery;Tablety i akcesoria;Tablety,2317.02


In [15]:
def get_user_id_from_session(session):
    """Return the user id shared by all events of a single session.

    Parameters
    ----------
    session : pandas.DataFrame
        Events of one session; must have a ``user_id`` column and at least
        one row.

    Returns
    -------
    The ``user_id`` value of the first row.

    Raises
    ------
    Exception
        If any row holds a ``user_id`` different from the first row's.
    """
    user_ids = session['user_id']
    first_user_id = user_ids.iloc[0]
    # Every event of a session is expected to come from the same user.
    if (user_ids != first_user_id).any():
        raise Exception("How it is even possible")
    return first_user_id


In [16]:
def check_if_user_bought_something(session):
    """Tell whether a session contains at least one purchase event.

    Parameters
    ----------
    session : pandas.DataFrame
        Events of one session; must have an ``event_type`` column.

    Returns
    -------
    bool
        ``True`` if any ``event_type`` equals ``'BUY_PRODUCT'``, else ``False``.
    """
    return any(kind == 'BUY_PRODUCT' for kind in session['event_type'])

In [84]:
def extract_session(session_id, source=None):
    """Summarise one session as a single-row DataFrame indexed by ``session_id``.

    Parameters
    ----------
    session_id
        Value to look up in the ``session_id`` column.
    source : pandas.DataFrame, optional
        Event log to search. When omitted, the notebook-level ``sessions_data``
        frame is used, so existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``beginning`` and ``end`` (earliest and latest
        ``timestamp``), ``user_id`` and ``bought_product``.

    Raises
    ------
    ValueError
        If the event log holds no row with the given ``session_id``.
    Exception
        Propagated from ``get_user_id_from_session`` when the session's rows
        disagree on ``user_id``.
    """
    if source is None:
        source = sessions_data
    session = source[source['session_id'] == session_id]
    if session.empty:
        # Fail with a readable message instead of the "min() arg is an empty
        # sequence" error raised by the builtin on an empty selection.
        raise ValueError(f"no events found for session_id={session_id!r}")
    timestamps = session['timestamp']
    summary = pd.DataFrame(
        data={
            'session_id': session_id,
            'beginning': [timestamps.min()],
            'end': [timestamps.max()],
            'user_id': get_user_id_from_session(session),
            'bought_product': check_if_user_bought_something(session),
        }
    )
    return summary.set_index('session_id')

In [85]:
def extract_session_data(sessions_data):
    """Collapse an event log into one summary row per session.

    All values are computed from the frame passed in, so calling this with a
    subset (for example the training split) only ever uses that subset's rows.
    A single ``groupby`` pass replaces the per-session scan of the event log.

    Parameters
    ----------
    sessions_data : pandas.DataFrame
        Event log with the columns ``session_id``, ``timestamp``, ``user_id``
        and ``event_type``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``session_id`` (in order of first appearance) with the
        columns ``beginning``, ``end``, ``user_id`` and ``bought_product``.

    Raises
    ------
    Exception
        If the rows of any session disagree on ``user_id``.
    """
    events = sessions_data.assign(
        _is_purchase=sessions_data['event_type'] == 'BUY_PRODUCT'
    )
    # sort=False keeps sessions in the order they first appear in the input.
    per_session = events.groupby('session_id', sort=False)

    # A session must belong to exactly one user; same error as the
    # single-session consistency check raises.
    if (per_session['user_id'].nunique() > 1).any():
        raise Exception("How it is even possible")

    extracted_session_data = per_session.agg(
        beginning=('timestamp', 'min'),
        end=('timestamp', 'max'),
        user_id=('user_id', 'first'),
        bought_product=('_is_purchase', 'any'),
    )
    return extracted_session_data

In [86]:
# Build the per-session summary table for the whole event log.
# NOTE(review): `sth` is a placeholder name; later cells reference it.
sth = extract_session_data(sessions_data)

In [87]:
def find_returned_users(extracted_sessions_data):
    """Return the ids of users that occur in at least two rows.

    Parameters
    ----------
    extracted_sessions_data : pandas.DataFrame
        Table with a ``user_id`` column; each row counts once towards its user.

    Returns
    -------
    pandas.Index
        The ``user_id`` values whose row count is 2 or more, ordered by
        descending count.
    """
    rows_per_user = extracted_sessions_data['user_id'].value_counts()
    returning = rows_per_user.loc[rows_per_user.ge(2)]
    return returning.index

In [44]:
# Count sessions per user and show the users seen in at least two sessions.
# `sth` is the per-session DataFrame, so the `user_id` column must be selected
# first: DataFrame.value_counts() would count identical rows instead of users.
user_counts = sth['user_id'].value_counts()
user_counts[user_counts>=2]


270    89
242    88
140    86
255    86
125    83
       ..
291     3
244     3
225     2
152     2
162     2
Name: user_id, Length: 196, dtype: int64

In [11]:
# Show the raw events recorded for session 1615.
sessions_data.loc[sessions_data['session_id'].eq(1615)]

Unnamed: 0,session_id,timestamp,user_id,product_id,event_type,offered_discount,purchase_id,timestamp_week,timestamp_quarter,timestamp_date
5462,1615,2021-01-08 17:29:37,139,1032,VIEW_PRODUCT,10,,1,1,2021-01-08
5463,1615,2021-01-08 17:32:51,139,1033,VIEW_PRODUCT,10,,1,1,2021-01-08
5464,1615,2021-01-08 17:35:32,139,1033,BUY_PRODUCT,10,20833.0,1,1,2021-01-08


In [52]:
# Keep the events of session 1615 around as a small fixture for the helper functions.
session = sessions_data.loc[sessions_data['session_id'].eq(1615)]

In [80]:
# Sanity check: summarise session 1615 and compare with its raw events.
session = sessions_data[sessions_data['session_id'] == 1615]
extract_session(1615)
# The two helpers below can be run on `session` to check them individually:
# get_user_id_from_session(session)
# check_if_user_bought_something(session)

Unnamed: 0,session_id,beginning,end,user_id,bought_product
0,1615,2021-01-08 17:29:37,2021-01-08 17:35:32,139,True


In [28]:
# Build the per-session summary for the training split.
extract_session_data(train_data)

4651

# Do poprawy
- ustalone kryteria sukcesu -> pole pod krzywą ROC (AUC) zamiast wartości krzywej
- brak sprawdzenia typów atrybutów i zakresów wartości -> histogramy + mądre zakresy
- brak weryfikacji, czy dane wydają się nadawać do modelowania (czy zmienne wejściowe coś mówią o zmiennej wyjściowej) -> dobre pytanie, właśnie to badam