In [None]:
import pandas as pd
import glob

from src.visual_utils import plot_data_bar, plot_data_line, plot_data_line_multiple
from src.stats_utils import calculate_retention, print_basic_stats

In [None]:
# Load every gzipped CSV export under data/raw/ into one DataFrame.
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the
# paths are sorted to make the row order of `df` reproducible across runs.
files = sorted(glob.glob("data/raw/*.csv.gz"))
if not files:
    # Fail early with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError("No files matching data/raw/*.csv.gz were found")

# Import data
dfs = [pd.read_csv(f, compression="gzip") for f in files]
# ignore_index=True gives the combined frame a single unique RangeIndex rather
# than repeating each file's own row labels.
df = pd.concat(dfs, ignore_index=True)

# Convert date values
df["install_date"] = pd.to_datetime(df["install_date"])
df["event_date"] = pd.to_datetime(df["event_date"])

# Whole days elapsed between the install date and the event date; later cells
# filter and group on this column.
df["days_after_install"] = (df["event_date"] - df["install_date"]).dt.days

In [None]:
# Print summary statistics for every numeric column of the dataset.
numeric_columns = df.select_dtypes(include="number").columns
for numeric_column in numeric_columns:
    print_basic_stats(df[numeric_column], name=numeric_column)

In [None]:
# Rows where the event is dated one day before the install date, i.e. a player
# who appears to have played the game before installing it.
played_before_install = df["days_after_install"].eq(-1)
df.loc[played_before_install]

Calculate and visualize daily active user values.

In [None]:
# Number of distinct users seen on each event date, drawn as a bar chart.
daily_active_users = df.groupby("event_date")["user_id"].nunique()

plot_data_bar(daily_active_users.index, daily_active_users.values, 45)

Calculate and visualize D1, D3, and D7 retention.

In [None]:
# Day offsets passed to calculate_retention; one line is drawn per entry.
days_to_plot = [1, 3, 7]

# Compute each curve once and keep it paired with its day, so the three
# parallel lists handed to the plotter stay in the same order.
retention_curves = [(day, calculate_retention(df, day)) for day in days_to_plot]

x_values_list = [curve.index for _, curve in retention_curves]
y_values_list = [curve.values for _, curve in retention_curves]
labels = [f"D{day} retention" for day, _ in retention_curves]

plot_data_line_multiple(x_values_list, y_values_list, labels, 45)

Average duration per session, by event date.

In [None]:
# Sum session time and session counts per event date in a single groupby, then
# divide the two to get the average length of one session on each date.
sessions_by_date = df.groupby("event_date")[["total_session_duration", "total_session_count"]].sum()
total_session_durations = sessions_by_date["total_session_duration"]
total_session_counts = sessions_by_date["total_session_count"]

duration_per_session_by_date = total_session_durations / total_session_counts
plot_data_line(total_session_counts.index, duration_per_session_by_date.values, x_rotation=45)

Average duration per session, by number of days after installing the game.

In [None]:
# Average length of one session as a function of days since install.
# groupby().agg() builds a new frame and does not modify `df`, so the dataset
# is aggregated directly instead of being copied in full first.
session_df = df.groupby("days_after_install").agg(
    total_session_duration=("total_session_duration", "sum"),
    total_session_count=("total_session_count", "sum"),
)
# Keep only offsets strictly greater than zero; this drops the install day
# itself and any negative offsets.
session_df = session_df[session_df.index > 0]

duration_per_session = session_df["total_session_duration"] / session_df["total_session_count"]
# Keep every 5th row to thin out the plot. The step is positional, so it only
# corresponds to every 5th day if no day offsets are missing from the data.
duration_per_session = duration_per_session.iloc[::5]
plot_data_line(duration_per_session.index, duration_per_session.values)