In [None]:
import pandas as pd
import glob

from src.visual_utils import plot_by_segment
from src.stats_utils import calculate_retention

In [None]:
# glob.glob returns matches in arbitrary order; sorting keeps the row order of
# the combined frame reproducible from one run to the next.
files = sorted(glob.glob("data/raw/*.csv.gz"))
if not files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(
        'No files match "data/raw/*.csv.gz"; check the working directory.'
    )

# Import data
dfs = [pd.read_csv(f, compression="gzip") for f in files]
# ignore_index=True gives the combined frame unique row labels; without it
# every file contributes its own 0..n-1 labels and the index repeats.
df = pd.concat(dfs, ignore_index=True)

# Convert date values
df["install_date"] = pd.to_datetime(df["install_date"])
df["event_date"] = pd.to_datetime(df["event_date"])

# Add a new column for later use: whole days elapsed between install and event.
df["days_after_install"] = (df["event_date"] - df["install_date"]).dt.days

The following code splits install-day records into equal-sized (quantile) segments based on total session duration.

The `segments` list can be modified to add or remove segments; the number of bins follows the length of the list, so the rest of the code adapts automatically.

In [None]:

segments = ["Low", "Medium", "High"]

# Keep only install-day rows (days_after_install == 0) whose total session
# duration is above 100 seconds.
# .copy() makes this an independent frame, so adding the "segment" column below
# does not write into a slice of `df` (avoids SettingWithCopyWarning).
engagement_df = df[
    (df["days_after_install"] == 0) & (df["total_session_duration"] > 100)
].copy()

# Create segments: quantile bins over session duration, one bin per label, so
# the number of bins always follows len(segments).
engagement_df["segment"] = pd.qcut(
    engagement_df["total_session_duration"],
    q=len(segments),
    labels=segments,
)

# Drop a "segment" column left by a previous run of this cell; otherwise the
# merge below would produce segment_x / segment_y instead of "segment".
if "segment" in df.columns:
    df = df.drop(columns=["segment"])

# One segment per user: if a user had several install-day rows, merging them
# all would duplicate that user's rows in `df` and inflate every later sum.
# NOTE(review): when duplicates exist the first row's segment is kept; confirm
# that this is the intended tie-break for the dataset.
user_segments = engagement_df[["user_id", "segment"]].drop_duplicates(subset="user_id")

# Left join keeps every row of `df`; users without a qualifying install-day row
# get a missing segment.
df = df.merge(
    user_segments,
    on="user_id",
    how="left",
)

The cell below calculates and plots retention for each segment.  

Parameters that can be changed for further inspection:  
- `days_to_plot`: list of day numbers to plot retention for (e.g. `[1, 3, 7]` produces the D1, D3 and D7 plots).

In [None]:
# Day numbers for which a retention plot is produced (one plot per entry).
days_to_plot = [1, 3, 7]

for day_number in days_to_plot:
    plot_by_segment(
        df,
        segments,
        "segment",
        # The default argument `d` freezes the current day at lambda creation.
        # The body must use `d`, not the loop variable: a closure over
        # `day_number` would see its latest value if the callable were ever
        # invoked after the loop has advanced.
        compute_series=lambda segment_df, d=day_number: calculate_retention(segment_df, d),
        # Single quotes inside the replacement field: reusing the outer double
        # quotes is a SyntaxError on Python versions before 3.12.
        title=f"D{day_number} retention for segments: [{', '.join(segments)}] First Day Engagements",
    )


These are session-related functions.

In [None]:
def total_session_duration_segmented(segment_df: pd.DataFrame) -> pd.Series:
    """Sum ``total_session_duration`` over the rows of each ``event_date``.

    Args:
        segment_df: Frame with ``event_date`` and ``total_session_duration``
            columns.

    Returns:
        Series indexed by ``event_date`` holding the summed duration per date.
    """
    rows_by_date = segment_df.groupby("event_date")
    duration_column = rows_by_date["total_session_duration"]
    return duration_column.sum()

def per_session_duration(segment_df: pd.DataFrame) -> pd.Series:
    """Average duration of a session for each ``event_date``.

    For every date, the summed ``total_session_duration`` is divided by the
    summed ``total_session_count``. No guard against a zero session count is
    applied.

    Args:
        segment_df: Frame with ``event_date``, ``total_session_duration`` and
            ``total_session_count`` columns.

    Returns:
        Series indexed by ``event_date`` with the duration-per-session ratio.
    """
    rows_by_date = segment_df.groupby("event_date")
    duration_sum = rows_by_date["total_session_duration"].sum()
    session_sum = rows_by_date["total_session_count"].sum()
    return duration_sum.div(session_sum)

In [None]:
# Hand the per-date total-session-duration aggregation to plot_by_segment,
# together with the segment labels and the name of the segment column.
total_duration_title = f"Total session duration for segments: {', '.join(segments)} First Day Engagements"
plot_by_segment(
    df, segments, "segment",
    compute_series=total_session_duration_segmented,
    title=total_duration_title,
)

# Same plot helper, this time fed with the duration-per-session aggregation.
per_session_title = f"Per session duration for segments: {', '.join(segments)} First Day Engagements"
plot_by_segment(
    df, segments, "segment",
    compute_series=per_session_duration,
    title=per_session_title,
)

These are revenue-related functions.

In [None]:
def iap_revenue(segment_df: pd.DataFrame) -> pd.Series:
    """Sum the ``iap_revenue`` column over the rows of each ``event_date``.

    Args:
        segment_df: Frame with ``event_date`` and ``iap_revenue`` columns.

    Returns:
        Series indexed by ``event_date`` holding the summed IAP revenue.
    """
    rows_by_date = segment_df.groupby("event_date")
    revenue_column = rows_by_date["iap_revenue"]
    return revenue_column.sum()

def ad_revenue(segment_df: pd.DataFrame) -> pd.Series:
    """Sum the ``ad_revenue`` column over the rows of each ``event_date``.

    Args:
        segment_df: Frame with ``event_date`` and ``ad_revenue`` columns.

    Returns:
        Series indexed by ``event_date`` holding the summed ad revenue.
    """
    rows_by_date = segment_df.groupby("event_date")
    revenue_column = rows_by_date["ad_revenue"]
    return revenue_column.sum()

In [None]:
# Hand the per-date IAP revenue aggregation to plot_by_segment, together with
# the segment labels and the name of the segment column.
iap_revenue_title = f"IAP Revenue for segments: {', '.join(segments)} First Day Engagements"
plot_by_segment(
    df, segments, "segment",
    compute_series=iap_revenue,
    title=iap_revenue_title,
)

# Same plot helper, this time fed with the per-date ad revenue aggregation.
ad_revenue_title = f"Ad Revenue for segments: {', '.join(segments)} First Day Engagements"
plot_by_segment(
    df, segments, "segment",
    compute_series=ad_revenue,
    title=ad_revenue_title,
)

These are match-related functions.

In [None]:
def ending_match_percentage(segment_df: pd.DataFrame) -> pd.Series:
    """Percentage of started matches that ended, for each ``event_date``.

    Per date, the summed ``match_end_count`` is divided by the summed
    ``match_start_count`` and scaled by 100. No guard against a zero start
    count is applied.

    Args:
        segment_df: Frame with ``event_date``, ``match_start_count`` and
            ``match_end_count`` columns.

    Returns:
        Series indexed by ``event_date`` with the ended/started percentage.
    """
    rows_by_date = segment_df.groupby("event_date")
    started = rows_by_date["match_start_count"].sum()
    finished = rows_by_date["match_end_count"].sum()
    completion_ratio = finished / started
    return completion_ratio * 100

def winning_match_percentage(segment_df: pd.DataFrame) -> pd.Series:
    """Percentage of ended matches that were victories, for each ``event_date``.

    Per date, the summed ``victory_count`` is divided by the summed
    ``match_end_count`` and scaled by 100. No guard against a zero end count
    is applied.

    Args:
        segment_df: Frame with ``event_date``, ``victory_count`` and
            ``match_end_count`` columns.

    Returns:
        Series indexed by ``event_date`` with the victory/ended percentage.
    """
    rows_by_date = segment_df.groupby("event_date")
    wins = rows_by_date["victory_count"].sum()
    finished = rows_by_date["match_end_count"].sum()
    win_ratio = wins / finished
    return win_ratio * 100

In [None]:
# Hand the per-date ended/started match percentage to plot_by_segment, together
# with the segment labels and the name of the segment column.
ending_matches_title = f"Ending matches percentage for segments: {', '.join(segments)} First Day Engagements"
plot_by_segment(
    df, segments, "segment",
    compute_series=ending_match_percentage,
    title=ending_matches_title,
)

# Same plot helper, this time fed with the per-date victory/ended percentage.
winning_matches_title = f"Winning ended matches percentage for segments: {', '.join(segments)} First Day Engagements"
plot_by_segment(
    df, segments, "segment",
    compute_series=winning_match_percentage,
    title=winning_matches_title,
)