# Data loading

In [1]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1nlp2wQAPrNHst7to0vkkz9uHyI4d1-xU',
                                    dest_path='./amz7k.zip')
gdd.download_file_from_google_drive(file_id='10dwiIFVwAyOQrc-4uQT0ZXq8AcvHQncC',
                                    dest_path='./sinth_dset.zip')

!unzip amz7k.zip -d .
!unzip sinth_dset.zip -d .

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
  inflating: ./sinth_dset/4606.csv   
  inflating: ./sinth_dset/2062.csv   
  inflating: ./sinth_dset/2287.csv   
  inflating: ./sinth_dset/4200.csv   
  inflating: ./sinth_dset/3436.csv   
  inflating: ./sinth_dset/1167.csv   
  inflating: ./sinth_dset/2829.csv   
  inflating: ./sinth_dset/663.csv    
  inflating: ./sinth_dset/1042.csv   
  inflating: ./sinth_dset/749.csv    
  inflating: ./sinth_dset/2558.csv   
  inflating: ./sinth_dset/1518.csv   
  inflating: ./sinth_dset/1313.csv   
  inflating: ./sinth_dset/675.csv    
  inflating: ./sinth_dset/4436.csv   
  inflating: ./sinth_dset/1162.csv   
  inflating: ./sinth_dset/1147.csv   
  inflating: ./sinth_dset/2297.csv   
  inflating: ./sinth_dset/4110.csv   
  inflating: ./sinth_dset/550.csv    
  inflating: ./sinth_dset/2500.csv   
  inflating: ./sinth_dset/4437.csv   
  inflating: ./sinth_dset/4135.csv   
  inflating: ./sinth_dset/3129.csv   
  infl

In [2]:
from typing import List
import os
import tqdm
import torch
import pandas as pd

def load_data(
    path_to_data: str,
    n_types: int,
    return_times = False,
    unix_time = False
    ) -> List[torch.Tensor]:
    files = os.listdir(path_to_data)
    seqs = []
    if 'clusters.csv' in files:
        files.remove('clusters.csv')
    if 'process_params.json' in files:
        files.remove('process_params.json')
    for i, f in tqdm.tqdm(enumerate(files)):
        df = pd.read_csv(path_to_data + '/' + f)
        df = df.sort_values(by=['time'])
        seq = torch.zeros(len(df['time']), n_types + 1)
        for id_, (time, event) in enumerate(zip(list(df['time']), list(df['event']))):
            seq[id_,0] = time
            seq[id_, int(event) + 1] = 1
        if return_times:
            seq[1:, 0] -= seq[:-1, 0]
            seq[0, 0] = 0
        else:
            seq[:,0]-=seq[0,0].clone()
        if unix_time:
            seq[:,0]/=86400
        seqs.append(seq)
    return seqs

# Plotting utils

In [3]:
!apt-get install dvipng texlive-latex-extra texlive-fonts-recommended cm-super

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  cm-super-minimal fonts-droid-fallback fonts-lato fonts-lmodern
  fonts-noto-mono fonts-texgyre ghostscript gsfonts javascript-common
  libcupsfilters1 libcupsimage2 libgs9 libgs9-common libijs-0.35 libjbig2dec0
  libjs-jquery libkpathsea6 libpotrace0 libptexenc1 libruby2.5 libsynctex1
  libtexlua52 libtexluajit2 libzzip-0-13 lmodern pfb2t1c2pfb poppler-data
  preview-latex-style rake ruby ruby-did-you-mean ruby-minitest
  ruby-net-telnet ruby-power-assert ruby-test-unit ruby2.5
  rubygems-integration t1utils tex-common tex-gyre texlive-base
  texlive-binaries texlive-latex-base texlive-latex-recommended
  texlive-pictures texlive-plain-generic tipa
Suggested packages:
  fonts-noto ghostscript-x apache2 | 

In [4]:
!git clone https://github.com/garrettj403/SciencePlots.git
!cd SciencePlots; pip install -e .

Cloning into 'SciencePlots'...
remote: Enumerating objects: 871, done.[K
remote: Counting objects: 100% (259/259), done.[K
remote: Compressing objects: 100% (126/126), done.[K
remote: Total 871 (delta 134), reused 254 (delta 132), pack-reused 612[K
Receiving objects: 100% (871/871), 85.05 MiB | 33.55 MiB/s, done.
Resolving deltas: 100% (463/463), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining file:///content/SciencePlots
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Installing collected packages: SciencePlots
  Running setup.py develop for SciencePlots
Successfully installed SciencePlots


In [5]:
import matplotlib.pyplot as plt


def save_formatted(fig, ax, settings, save_path, xlabel=None, ylabel=None, title=None):
    """
    Saves (fig, ax) object to save_path with settigns json file
    """
    # labels and title
    if settings["show labels"]:
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
    else:
        ax.set_xlabel(None)
        ax.set_ylabel(None)
    if settings["show title"]:
        plt.title(title)
    #
    ax.tick_params(
        axis="both", which="major", labelsize=settings["tick labels font size"]
    )
    ax.tick_params(
        axis="both", which="minor", labelsize=2 * settings["tick labels font size"] // 3
    )
    # set height and width
    fig.set_figheight(settings["fig height"])
    fig.set_figwidth(settings["fig width"])
    # set aspect ratio
    x0, x1 = ax.get_xlim()
    y0, y1 = ax.get_ylim()
    ax.set_aspect(settings["aspect ratio"] * abs((x1 - x0) / (y1 - y0)))
    # save
    plt.savefig(save_path, dpi=settings["dpi"], bbox_inches="tight")
    plt.clf()

In [14]:
settings = {
	"aspect ratio": 1.00,
	"dpi": 400,
	"fig height": 7,
	"fig width": 5,
	"tick labels font size": 10,
	"style": "science",
	"show labels": True,
	"show title": True
}

In [15]:
plt.style.available

['Solarize_Light2',
 '_classic_test_patch',
 'bmh',
 'bright',
 'cjk-jp-font',
 'cjk-kr-font',
 'cjk-sc-font',
 'cjk-tc-font',
 'classic',
 'dark_background',
 'fast',
 'fivethirtyeight',
 'ggplot',
 'grayscale',
 'grid',
 'high-contrast',
 'high-vis',
 'ieee',
 'latex-sans',
 'light',
 'muted',
 'nature',
 'no-latex',
 'notebook',
 'pgf',
 'retro',
 'russian-font',
 'scatter',
 'science',
 'seaborn',
 'seaborn-bright',
 'seaborn-colorblind',
 'seaborn-dark',
 'seaborn-dark-palette',
 'seaborn-darkgrid',
 'seaborn-deep',
 'seaborn-muted',
 'seaborn-notebook',
 'seaborn-paper',
 'seaborn-pastel',
 'seaborn-poster',
 'seaborn-talk',
 'seaborn-ticks',
 'seaborn-white',
 'seaborn-whitegrid',
 'std-colors',
 'tableau-colorblind10',
 'vibrant']

In [16]:
plt.style.reload_library()

# Sinthetic

In [17]:
data = load_data('sinth_dset', 5)

5000it [00:29, 169.94it/s]


In [18]:
import numpy as np
import torch
import matplotlib.pyplot as plt

In [20]:
"""Plots sinthetic dataset counting function ignoring event types"""

import matplotlib.pyplot as plt
from matplotlib import ticker

with plt.style.context(settings["style"]):
    fig, axs = plt.subplots()
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((-2, 2))
    axs.yaxis.set_major_formatter(formatter)
    for i, seq in enumerate(data):
        if i == 100:
          break
        times = torch.arange(0,100,0.01)
        counts = torch.Tensor([torch.sum(seq[:,0]<=t) for t in times])
        axs.plot(times, counts)
    save_formatted(
        fig,
        axs,
        settings,
        save_path="sinth_count_func.pdf",
        xlabel="time",
        ylabel="count",
        title="Sinthetic Dataset Counting Function",
    )

<Figure size 360x504 with 0 Axes>

In [22]:
"""Plots sinthetic dataset counting function for each event type"""

for j in range(5):
    with plt.style.context(settings["style"]):
        fig, axs = plt.subplots()
        formatter = ticker.ScalarFormatter(useMathText=True)
        formatter.set_scientific(True)
        formatter.set_powerlimits((-2, 2))
        axs.yaxis.set_major_formatter(formatter)
        for i, seq in enumerate(data):
            if i == 100:
              break

            times = torch.arange(0,100,0.01)
            tmp = seq[seq[:,j+1] == 1, 0]
            counts = torch.Tensor([torch.sum(tmp<=t) for t in times])
            axs.plot(times, counts)
        save_formatted(
            fig,
            axs,
            settings,
            save_path=f"sinth_count_func_{j}_event_type.pdf",
            xlabel="time",
            ylabel="count",
            title=f"Sinthetic Dataset {j} Event Type Counting Function",
        )
    


<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

In [26]:
"""Plots sinthetic dataset empirical probability density function vs poisson process probability density funciton with lambda obtained from the dataset"""

return_times = torch.concat([d[1:,0] - d[:-1,0] for d in data])
lambda_ = 1/torch.mean(return_times)
times = torch.arange(0,10,0.01)
pdf = lambda_*torch.exp(-lambda_*times)

with plt.style.context(settings["style"]):
    fig, axs = plt.subplots()
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((-2, 2))
    axs.yaxis.set_major_formatter(formatter)
    axs.hist(return_times, bins=1000, density=True, label="Empirical Distribution")
    axs.plot(times, pdf, label="Poisson Process Distribution")
    axs.legend(loc="upper right", bbox_to_anchor=(1,1))
    save_formatted(
        fig,
        axs,
        settings,
        save_path="sinth_pdf.pdf",
        xlabel="time",
        ylabel="pdf",
        title="Sinthetic Dataset Probability Density Function",
    )

<Figure size 360x504 with 0 Axes>

In [27]:
"""Plots sinthetic dataset empirical survival function vs poisson process survival funciton with lambda obtained from the dataset"""

times = torch.arange(0,1.5,0.01)
survival = torch.Tensor([torch.sum(return_times>t)/len(return_times) for t in times])
true_survival = torch.exp(-lambda_*times)

with plt.style.context(settings["style"]):
    fig, axs = plt.subplots()
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((-2, 2))
    axs.yaxis.set_major_formatter(formatter)
    axs.plot(times, survival, label="Empirical Survival")
    axs.plot(times, true_survival, label="Poisson Process Survival")
    axs.legend(loc="upper right", bbox_to_anchor=(1,1))
    save_formatted(
        fig,
        axs,
        settings,
        save_path="sinth_sf.pdf",
        xlabel="time",
        ylabel="survival probability",
        title="Sinthetic Dataset Survival Function",
    )

<Figure size 360x504 with 0 Axes>

# Amazon

In [28]:
data = load_data('amz7k', 8, unix_time = True)

7523it [00:34, 215.97it/s]


In [29]:
"""Plots amazon dataset counting function ignoring event types"""

import matplotlib.pyplot as plt
from matplotlib import ticker

with plt.style.context(settings["style"]):
    fig, axs = plt.subplots()
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((-2, 2))
    axs.yaxis.set_major_formatter(formatter)
    for i, seq in enumerate(data):
        if i == 100:
          break
        times = torch.arange(0,1000,0.1)
        counts = torch.Tensor([torch.sum(seq[:,0]<=t) for t in times])
        axs.plot(times, counts)
    save_formatted(
        fig,
        axs,
        settings,
        save_path="amz_count_func.pdf",
        xlabel="time",
        ylabel="count",
        title="Amazon Dataset Counting Function",
    )

<Figure size 360x504 with 0 Axes>

In [30]:
"""Plots amazon dataset counting function for each event type"""

for j in range(8):
    with plt.style.context(settings["style"]):
        fig, axs = plt.subplots()
        formatter = ticker.ScalarFormatter(useMathText=True)
        formatter.set_scientific(True)
        formatter.set_powerlimits((-2, 2))
        axs.yaxis.set_major_formatter(formatter)
        for i, seq in enumerate(data):
            if i == 100:
              break

            times = torch.arange(0,1000,0.1)
            tmp = seq[seq[:,j+1] == 1, 0]
            counts = torch.Tensor([torch.sum(tmp<=t) for t in times])
            axs.plot(times, counts)
        save_formatted(
            fig,
            axs,
            settings,
            save_path=f"amz_count_func_{j}_event_type.pdf",
            xlabel="time",
            ylabel="count",
            title=f"Amazon Dataset {j} Event Type Counting Function",
        )

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

<Figure size 360x504 with 0 Axes>

In [36]:
"""Plots amazon dataset empirical probability density function excluding outliers vs poisson process probability density funciton with lambda obtained from the dataset"""

return_times = torch.concat([d[1:,0] - d[:-1,0] for d in data])
lambda_ = 1/torch.mean(return_times)
return_times = return_times[return_times < 100]
times = torch.arange(0,100,0.01)
pdf = lambda_*torch.exp(-lambda_*times)

with plt.style.context(settings["style"]):
    fig, axs = plt.subplots()
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((-2, 2))
    axs.yaxis.set_major_formatter(formatter)
    axs.hist(return_times, bins=100, density=True, label="Empirical Distribution")
    axs.plot(times, pdf, label="Poisson Process Distribution")
    axs.legend(loc="upper right", bbox_to_anchor=(1,1))
    save_formatted(
        fig,
        axs,
        settings,
        save_path="amz_pdf.pdf",
        xlabel="time",
        ylabel="pdf",
        title="Amazon Dataset Probability Density Function",
    )

<Figure size 360x504 with 0 Axes>

In [37]:
"""Plots amazon dataset empirical survival function excluding outliers vs poisson process survival funciton with lambda obtained from the dataset"""

times = torch.arange(0,100,0.01)
survival = torch.Tensor([torch.sum(return_times>=t)/len(return_times) for t in times])
true_survival = torch.exp(-lambda_*times)

with plt.style.context(settings["style"]):
    fig, axs = plt.subplots()
    formatter = ticker.ScalarFormatter(useMathText=True)
    formatter.set_scientific(True)
    formatter.set_powerlimits((-2, 2))
    axs.yaxis.set_major_formatter(formatter)
    axs.plot(times, survival, label="Empirical Survival")
    axs.plot(times, true_survival, label="Poisson Process Survival")
    axs.legend(loc="upper right", bbox_to_anchor=(1,1))
    save_formatted(
        fig,
        axs,
        settings,
        save_path="amz_sf.pdf",
        xlabel="time",
        ylabel="survival probability",
        title="Amazon Dataset Survival Function",
    )

<Figure size 360x504 with 0 Axes>