# Introduction
- This notebook covers:
    - Loading and sorting `toolwindow_data.csv`
    - Reconstructing sessions from matched open/close events
    - A quick overview of the newly generated `sessions.csv`

# Imports

In [None]:
import pandas as pd
import plotly.express as px

# Load and Sort Data

In [None]:
# Read the raw event log, derive a `datetime` column from the millisecond
# `timestamp` column, and order the events per user by time.
df = (
    pd.read_csv(r'data/toolwindow_data.csv')
    .assign(datetime=lambda events: pd.to_datetime(events['timestamp'], unit='ms'))
    .sort_values(by=['user_id', 'datetime'])
)
df

# Session Reconstruction by matching Open/Close pairs

In [None]:
# Upper bound (exclusive) on a plausible session length: 10 hours, in seconds.
MAX_SESSION_SECONDS = 10 * 60 * 60

# One dict per reconstructed session; turned into a DataFrame in a later cell.
sessions = []

for user_id, group in df.groupby('user_id'):
    # Opens that have not been closed yet. A close is paired with the most
    # recent pending open (LIFO).
    # NOTE(review): if a close event is missing, the older open stays on the
    # stack and may be paired with a much later close — confirm this is the
    # intended matching rule.
    open_stack = []

    # Iterate only the columns that are needed. Unlike `iterrows()`, this does
    # not build a Series per row and keeps each column's own dtype.
    events = zip(group['event'], group['datetime'], group['open_type'])

    for event, event_time, open_type in events:
        if event == 'opened':
            open_stack.append((event_time, open_type))
            continue

        # Skip anything that is not a close, and closes with no pending open.
        if event != 'closed' or not open_stack:
            continue

        open_time, opened_via = open_stack.pop()
        duration = (event_time - open_time).total_seconds()

        # Keep only strictly positive durations shorter than the cap; zero,
        # negative and over-long pairs are dropped.
        if 0 < duration < MAX_SESSION_SECONDS:
            sessions.append({
                'user_id': user_id,
                'open_time': open_time,
                'close_time': event_time,
                'open_type': opened_via,
                'duration_sec': duration,
            })

In [None]:
# Columns of the sessions table, in output order. Passing them explicitly
# keeps the schema (and the CSV header) intact even when `sessions` is empty,
# so the overview cells below can still look up `open_type` / `duration_sec`.
SESSION_COLUMNS = ['user_id', 'open_time', 'close_time', 'open_type', 'duration_sec']

sessions_df = pd.DataFrame(sessions, columns=SESSION_COLUMNS)

# Persist the reconstructed sessions without the positional index column.
sessions_df.to_csv(r'data/sessions.csv', index=False)
sessions_df

# Quick Overview

In [None]:
# Print a structural summary of the sessions table (columns, dtypes, non-null counts).
sessions_df.info()

In [None]:
# Report how many sessions were reconstructed.
session_count = len(sessions_df)
print("Total sessions:", session_count)

In [None]:
# Number of sessions per open type.
open_type_column = sessions_df['open_type']
open_type_column.value_counts()

In [None]:
# Descriptive statistics of session duration, split by open type.
duration_by_open_type = sessions_df.groupby('open_type')['duration_sec']
duration_by_open_type.describe()

In [None]:
# Box plot of session duration per open type, with every session drawn as a point.
fig = px.box(
    sessions_df,
    x='open_type',
    y='duration_sec',
    color='open_type',
    points='all',
    title='Duration Distribution by Open Type',
)

# Human-readable axis labels.
axis_titles = dict(xaxis_title='Open Type', yaxis_title='Duration (seconds)')
fig.update_layout(**axis_titles)

fig.show()