In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeRegressor

def load(data_dir="/home/asia/Documents/projects/PyCharm/IUM/data/data_v3/"):
    """Read the four JSON-lines tables of the data set into DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``deliveries.jsonl``, ``sessions.jsonl``,
        ``products.jsonl`` and ``users.jsonl``. A trailing slash is not
        required. The default is the original author's local path and is
        kept only so that existing ``load()`` calls keep working; pass your
        own directory when running elsewhere.

    Returns
    -------
    dict
        Maps ``"deliveries"``, ``"sessions"``, ``"products"`` and ``"users"``
        (in that order) to the DataFrame read from the matching file.
    """
    base = Path(data_dir)
    table_names = ("deliveries", "sessions", "products", "users")
    # Path joining works with or without a trailing separator in `data_dir`.
    return {
        name: pd.read_json(base / f"{name}.jsonl", lines=True)
        for name in table_names
    }

def preprocess_time(deliveries: pd.DataFrame) -> pd.DataFrame:
    """Parse the delivery timestamps and add the delivery duration in minutes.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    deliveries : pd.DataFrame
        Must contain ``purchase_timestamp`` and ``delivery_timestamp``
        columns.

    Returns
    -------
    pd.DataFrame
        Copy of ``deliveries`` in which both timestamp columns are
        datetimes and a float column ``time[min]`` holds
        ``delivery_timestamp - purchase_timestamp`` in minutes. Rows whose
        timestamps could not be parsed get ``NaT`` / ``NaN``.
    """
    result = deliveries.copy()

    # errors="coerce" turns values that do not match the format into NaT.
    # NOTE(review): the two columns are parsed with different formats
    # (purchase without seconds, delivery with seconds) — confirm this
    # matches the raw data, otherwise rows are silently coerced to NaT.
    result["purchase_timestamp"] = pd.to_datetime(
        result["purchase_timestamp"], format="%Y-%m-%dT%H:%M", errors="coerce"
    )
    result["delivery_timestamp"] = pd.to_datetime(
        result["delivery_timestamp"], format="%Y-%m-%dT%H:%M:%S", errors="coerce"
    )

    elapsed = result["delivery_timestamp"] - result["purchase_timestamp"]
    # Vectorised conversion; always yields a float column, even when empty.
    result["time[min]"] = elapsed.dt.total_seconds() / 60
    return result


def merge(data: dict) -> pd.DataFrame:
    """Join purchase sessions with deliveries, users and products.

    Parameters
    ----------
    data : dict
        Must provide DataFrames under the keys ``"sessions"``,
        ``"deliveries"``, ``"products"`` and ``"users"``.

    Returns
    -------
    pd.DataFrame
        Session rows that have a non-null ``purchase_id``, outer-joined with
        deliveries on ``purchase_id``, outer-joined with users on
        ``user_id``, then left-joined with products on ``product_id``.
        Because of the outer joins, deliveries without a matching session
        and users without any purchase appear as rows with missing values.
    """
    sessions = data["sessions"]
    deliveries = data["deliveries"]
    products = data["products"]
    users = data["users"]

    # Only sessions that ended in a purchase carry a purchase_id.
    purchases = sessions[sessions["purchase_id"].notnull()].copy()

    with_deliveries = pd.merge(purchases, deliveries, on="purchase_id", how="outer")
    with_users = pd.merge(with_deliveries, users, on="user_id", how="outer")
    return pd.merge(with_users, products, on="product_id", how="left")

def drop_useless(merged):
    """Return a copy of ``merged`` without the columns this notebook discards.

    The removed columns are ``session_id``, ``event_type``, ``timestamp``,
    ``delivery_timestamp``, ``purchase_id`` and ``name``. ``merged`` itself
    is not modified. pandas raises ``KeyError`` if any of them is missing.
    """
    unwanted = (
        "session_id",
        "event_type",
        "timestamp",
        "delivery_timestamp",
        "purchase_id",
        "name",
    )
    return merged.drop(columns=list(unwanted))

In [7]:
# Load the raw tables, add the delivery duration, and join everything.
data = load()
data["deliveries"] = preprocess_time(data["deliveries"])
merged = merge(data)
# Only the last expression of a cell is rendered, so the former
# `merged.head()` line was computed and discarded; show the shape only.
merged.shape

(7288, 17)

In [9]:
# TODO: add purchase day of week, purchase hour, separate street and block number
# Number of distinct street values (a missing value counts as one value).
merged["street"].nunique(dropna=False)

200

In [6]:
# add postal code? http://kodpocztowy.intami.pl/

In [2]:
# Read only the users table; its street/city columns are explored below.
data_dir = "/home/asia/Documents/projects/PyCharm/IUM/data/data_v3/"
users = pd.read_json(f"{data_dir}users.jsonl", lines=True)
users.head()

Unnamed: 0,user_id,name,city,street
0,102,Aurelia Malon,Police,pl. Brzoskwiniowa 11/53
1,103,Mateusz Kobel,Police,al. Wrocławska 10
2,104,Radosław Ratka,Mielec,pl. Nowa 89/04
3,105,Anastazja Oszust,Szczecin,ul. Częstochowska 80
4,106,Sylwia Nurek,Szczecin,al. Wiosenna 72


In [4]:
# Number of distinct user ids (a missing value would count as one value).
users["user_id"].nunique(dropna=False)


200

In [5]:
# Split each address into prefix / street name / number.
# The first token is the prefix ("ul.", "al.", "pl."), the last token is the
# number, and everything in between is the street name. Taking the middle
# as a whole keeps multi-word street names intact instead of pushing their
# second word into `num`.
streets = pd.DataFrame({"address": users["street"].to_list()})
address_tokens = streets["address"].str.split(" ")
streets["prefix"] = address_tokens.str[0]
streets["st"] = address_tokens.str[1:-1].str.join(" ")
streets["num"] = address_tokens.str[-1]
# Positional copy (like `address` above) so a non-default index on `users`
# cannot misalign the rows.
streets["city"] = users["city"].to_list()

streets.head()

Unnamed: 0,address,prefix,st,num,city
0,pl. Brzoskwiniowa 11/53,pl.,Brzoskwiniowa,11/53,Police
1,al. Wrocławska 10,al.,Wrocławska,10,Police
2,pl. Nowa 89/04,pl.,Nowa,89/04,Mielec
3,ul. Częstochowska 80,ul.,Częstochowska,80,Szczecin
4,al. Wiosenna 72,al.,Wiosenna,72,Szczecin


In [7]:
# Keep only the part of the number before the first "/" (e.g. "11/53" -> "11").
# The vectorised .str accessor propagates missing values instead of raising.
streets["num"] = streets["num"].str.split("/", n=1).str[0]
streets.head()

Unnamed: 0,address,prefix,st,num,city
0,pl. Brzoskwiniowa 11/53,pl.,Brzoskwiniowa,11,Police
1,al. Wrocławska 10,al.,Wrocławska,10,Police
2,pl. Nowa 89/04,pl.,Nowa,89,Mielec
3,ul. Częstochowska 80,ul.,Częstochowska,80,Szczecin
4,al. Wiosenna 72,al.,Wiosenna,72,Szczecin


In [11]:
# Placeholder column: 0.0 for every row (scalar broadcast, float dtype).
# NOTE(review): presumably reserved for the postal code mentioned in an
# earlier cell — confirm the intended dtype before filling it in.
streets["code"] = 0.0

In [12]:
# Preview the first rows of `streets`, now including the `code` column.
streets.head()

Unnamed: 0,address,prefix,st,num,city,code
0,pl. Brzoskwiniowa 11/53,pl.,Brzoskwiniowa,11,Police,0.0
1,al. Wrocławska 10,al.,Wrocławska,10,Police,0.0
2,pl. Nowa 89/04,pl.,Nowa,89,Mielec,0.0
3,ul. Częstochowska 80,ul.,Częstochowska,80,Szczecin,0.0
4,al. Wiosenna 72,al.,Wiosenna,72,Szczecin,0.0


In [13]:
# Inspect the row at position 3; the list index keeps the result a one-row DataFrame.
streets.iloc[[3]]

Unnamed: 0,address,prefix,st,num,city,code
3,ul. Częstochowska 80,ul.,Częstochowska,80,Szczecin,0.0


In [None]:
# city/street/num