In [None]:
import pandas as pd
import glob

from src.visual_utils import plot_by_segment
from src.stats_utils import find_best_performing_countries

In [None]:
# Collect the raw exports. glob.glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep the row order of the
# concatenated frame reproducible between runs and machines.
files = sorted(glob.glob("data/raw/*.csv.gz"))
if not files:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No files matching data/raw/*.csv.gz were found")

# Import data
dfs = [pd.read_csv(f, compression="gzip") for f in files]
# ignore_index=True rebuilds a unique 0..n-1 index; otherwise every file
# contributes its own 0-based labels and the combined index has duplicates.
df = pd.concat(dfs, ignore_index=True)

# Convert date values
df["install_date"] = pd.to_datetime(df["install_date"])
df["event_date"] = pd.to_datetime(df["event_date"])

# Add new columns for later use.
# Whole days elapsed between install and the event.
df["days_after_install"] = (df["event_date"] - df["install_date"]).dt.days
# Combined revenue per row; NaN if either revenue component is missing.
df["total_revenue"] = df["ad_revenue"] + df["iap_revenue"]

In [None]:
# Which countries earn more from ads than from in-app purchases (iap)?

# Sum every revenue stream per country, then drop small markets
# (total revenue of 100 or less) in the same pipeline.
temp = (
    df.groupby("country")
    .agg(
        {
            "ad_revenue": "sum",
            "iap_revenue": "sum",
            "total_revenue": "sum",
        }
    )
    .loc[lambda totals: totals["total_revenue"] > 100]
)

# Cell output: the remaining countries where ad revenue exceeds iap revenue.
temp.loc[lambda totals: totals["ad_revenue"] > totals["iap_revenue"]]

The code below finds the best-performing countries for a given criteria column and criteria function.

For example, criteria column "user_id" with criteria function "nunique" counts the unique entries in the "user_id" column for each country and returns the `country_count` countries with the highest counts.

In [None]:
# Ranking settings: which column to score countries on, how to aggregate it,
# and how many countries to request.
criteria_column, criteria_function = "user_id", "nunique"
country_count = 5

# Keep the request within [0, number of distinct countries in the data].
country_count = min(max(country_count, 0), df["country"].nunique())

countries = find_best_performing_countries(
    df,
    criteria_column,
    country_count,
    criteria_function,
)
countries

In [None]:
def per_session_duration(segment_df: pd.DataFrame) -> pd.Series:
    """Return the average duration of a single session for each event date.

    Args:
        segment_df: Frame with the columns ``event_date``,
            ``total_session_duration`` and ``total_session_count``.

    Returns:
        A Series indexed by ``event_date`` holding the summed session
        duration divided by the summed session count for that date. Dates
        whose summed session count is zero yield NaN.
    """
    # Aggregate both columns in a single pass instead of grouping twice.
    daily_totals = segment_df.groupby("event_date")[
        ["total_session_duration", "total_session_count"]
    ].sum()
    session_counts = daily_totals["total_session_count"]
    # Mask zero counts so the division produces NaN rather than +/-inf,
    # which would otherwise distort any plot or aggregate of the result.
    safe_counts = session_counts.where(session_counts != 0)
    return daily_totals["total_session_duration"] / safe_counts

# Pass the full frame, the selected countries, the segment column name and the
# per-date metric function to the project plotting helper.
# NOTE(review): presumably this draws one per_session_duration series per
# country in `countries` — confirm against src.visual_utils.plot_by_segment.
plot_by_segment(df, countries, "country", per_session_duration)