In [46]:
import pandas as pd

# Read the three label tables; each one carries an `id` column used as join key.
df_time_consume = pd.read_csv('data/label_time.csv')
df_time_complete = pd.read_csv('data/label_time_complete.csv')
df_users_top_100 = pd.read_csv('data/label_users_top_100.csv')

# Inner-join the tables on `id` (only ids present in all three survive),
# give the label columns short names, keep just the columns used below and
# restrict the frame to the three users with codes 3, 18 and 84.
df = (
    df_time_consume
    .merge(df_time_complete, on='id', how='inner')
    .merge(df_users_top_100, on='id', how='inner')
    .rename(columns={
        'label_users_top_100': 'user_as_index',
        'label_closed': 'user',
        'received_time': 'received',
        'label_bins': 'consumption',
        'time_bins_solution_timestamp': 'solution',
    })
    [['consumption', 'received', 'solution', 'user', 'user_as_index']]
    .loc[lambda frame: frame.user_as_index.isin([3, 18, 84])]
)
# Uncomment to work on a small sample while iterating:
# df = df[:2000]

# Sanity check: rows per selected user and summary statistics per column.
print(df['user'].value_counts())
print(df.describe())

alib    13724
ep      13670
rasp    10703
Name: user, dtype: int64
        consumption      received      solution  user_as_index
count  38097.000000  3.809700e+04  3.809700e+04   38097.000000
mean       1.635431  1.560657e+09  1.561090e+09      31.138515
std        1.419082  5.768741e+07  5.776723e+07      33.648785
min        0.000000  1.450250e+09  1.450259e+09       3.000000
25%        0.000000  1.511167e+09  1.511493e+09       3.000000
50%        1.000000  1.556788e+09  1.557225e+09      18.000000
75%        3.000000  1.609747e+09  1.609938e+09      84.000000
max        4.000000  1.664281e+09  1.666011e+09      84.000000


### Time Consumption: time_consume
Intervals are in rounded minutes. Note: bins 0 and 1 both list 2 as a boundary; confirm which bin a 2-minute value belongs to.
- 0: 0 - 2
- 1: 2 - 5
- 2: 6 - 10
- 3: 11 - 25
- 4: 26 - 50

### Time Completion: time_complete
Intervals are in minutes; the values in parentheses give each bin's upper bound in seconds (and in days, where stated).
- 0: 0 - 4  (240 seconds)
- 1: 4 - 105  (6300 seconds)
- 2: 105 - 3791 (2.6 days) (227460 seconds)
- 3: 3791 - 14633 (10.2 days) (877980 seconds)
- 4: 14633 - 44640 (31 days) (2678400 seconds)

In [47]:
# Number of distinct users left after filtering.
print(df['user'].value_counts().size)

3


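For every time-step (the `received` time of each item), compute each user's workload: the summed `consumption` of that user's items that are open at that time.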

In [48]:
class Workload:
    """Container that holds one value per key as an instance attribute.

    Values are written with ``set(key, value)`` and read back with
    ``get(key)``; they are also reachable as plain attributes
    (``workload.some_key``).
    """

    def set(self, key, value):
        """Store ``value`` on this instance under the attribute name ``key``."""
        # Use the builtin rather than calling the dunder method directly.
        setattr(self, key, value)

    def get(self, key):
        """Return the value stored under ``key``.

        Raises:
            AttributeError: if no attribute named ``key`` exists.
        """
        return getattr(self, key)

In [49]:
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt

# Column layout: the timestamp followed by one workload column per distinct user.
columns = ['timestamp', *(f'workload_{idx}' for idx in range(df['user'].value_counts().size))]

# Accumulators that the workload loop appends to, one entry per time-step.
counts, workloads = [], []

# Distinct user names found in df.
users = df['user'].unique()

last_timestamp = 0

In [50]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of the results to lists created in an earlier cell.
counts = []
workloads = []

# NOTE: every time-step scans the whole frame, so the loop is quadratic in
# the number of rows.
for received in tqdm(df.received, total=len(df)):

    # Items that are open at this time-step: received at or before it and
    # not solved before it. Both comparisons are inclusive, so an item solved
    # exactly at `received` still counts as open.
    open_items = df[(df.received <= received) & (df.solution >= received)]

    # Number of open items at this time-step, across all users.
    counts.append(len(open_items))

    # Per-user workload: the summed consumption labels of that user's open items.
    workload = Workload()
    for user in users:
        workload.set(user, open_items.loc[open_items.user == user, 'consumption'].sum())

    workloads.append(workload)

100%|██████████| 38097/38097 [01:24<00:00, 453.45it/s]


In [1]:
# Requires `users` and `workloads` from the preceding cells: run the notebook
# top to bottom. Executing this cell on a fresh kernel raises NameError.
fig, ax = plt.subplots()

# One line per user: that user's workload at every time-step, in the order
# the time-steps were processed.
for user in users:
    ax.plot([step.get(user) for step in workloads], label=f'{user}')

ax.set_xlabel('time-step (row position in df)')
ax.set_ylabel('workload (summed consumption of open items)')
ax.set_title('Workload per user over time')
ax.legend()
plt.show()



NameError: name 'users' is not defined