In [None]:
import pandas as pd
from pathlib import Path

def load_cookie_cats(
    local_path: str = "cookie_cats.csv",
    github_repo: str = "TahaFurkanTorun/Cookie-Cats-A-B-Testing",
    ref: str = "dc8928c4c56a8d895ae0542919af3ac03623e7b3",
    filename: str = "cookie_cats.csv",
    verbose: bool = False,
) -> tuple:
    """
    Load the Cookie Cats dataset, preferring a local copy over the network.

    Parameters
    ----------
    local_path : str
        Path checked first. It is used only when it points at a regular
        file; a missing path or a directory triggers the remote fallback.
    github_repo : str
        ``owner/repository`` slug used to build the raw GitHub URL.
    ref : str
        Branch, tag or commit inserted into the raw URL. The default is a
        full commit hash, so the remote file is pinned to one revision.
    filename : str
        Path of the CSV inside the repository.
    verbose : bool
        When True, print one line stating where the data was loaded from.

    Returns
    -------
    tuple
        ``(df, source)`` where ``df`` is the DataFrame produced by
        ``pd.read_csv`` and ``source`` is the string ``'local'`` or the raw
        URL that was fetched.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raises (unreadable/invalid local file,
        network or HTTP failure for the remote URL) is propagated unchanged.
    """
    p = Path(local_path)
    # is_file() rather than exists(): exists() is also true for a directory,
    # which read_csv cannot open. Anything that is not a regular file is
    # treated as "no local copy" so the remote fallback still runs.
    if p.is_file():
        df = pd.read_csv(p)
        if verbose:
            print(f"Loaded '{local_path}' from local filesystem.")
        return df, "local"

    # No usable local copy: read the CSV straight from the raw GitHub URL.
    raw_url = f"https://raw.githubusercontent.com/{github_repo}/{ref}/{filename}"
    df = pd.read_csv(raw_url)
    if verbose:
        print(f"Loaded '{filename}' from GitHub: {raw_url}")
    return df, raw_url

# Read the dataset a single time for the rest of the notebook. `source` keeps
# where it came from ('local' or the raw GitHub URL); progress output stays
# off, which is also the loader's default.
df, source = load_cookie_cats(verbose=False)


# Cookie Cats A/B Test Analysis

This notebook contains the analysis of an A/B test conducted in the mobile game Cookie Cats. The goal of the test was to determine if moving the first gate from level 30 to level 40 would imp[...]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# NOTE: this cell relies on `df` created by the first cell
# (df, source = load_cookie_cats()); run that cell before this one.


def _divider() -> None:
    """Print the 50-character '=' rule with a blank line above and below."""
    print("\n" + "=" * 50 + "\n")


# --- First look at the raw data ---------------------------------------------
print("First rows of the dataset:", df.head(), sep="\n")
_divider()

print("General information about the dataset:")
df.info()  # info() writes its report directly to stdout

_divider()

# --- Group sizes per A/B version --------------------------------------------
print("Number of players in each version:", df["version"].value_counts(), sep="\n")
_divider()

# --- Game rounds: raw distribution ------------------------------------------
print("Distribution of gamerounds played:")
df["sum_gamerounds"].plot(kind="box", figsize=(10, 5), title="Distribution of gamerounds played")
plt.show()

# Largest value in the column, i.e. the point that stretches the box plot.
print("The maximum value of sum_gamerounds is:", df["sum_gamerounds"].max(), sep="\n")

# Keep only rows with fewer than 15000 rounds; this excludes the extreme value
# printed above (presumably a single outlier -- verify against the data).
df_cleaned = df.loc[df["sum_gamerounds"].lt(15000)]

# --- Game rounds: distribution without the extreme value --------------------
print("\nDistribution of game rounds played (after removing the outlier):")
df_cleaned["sum_gamerounds"].plot(kind="box", figsize=(5, 10), title="Distribution of Game Rounds Played (Outlier Removed)")
plt.show()

# Summary statistics, overall and then split by version.
print("\nDescriptive statistics for sum_gamerounds (outlier removed):", df_cleaned["sum_gamerounds"].describe(), sep="\n")

print("\nDescriptive statistics for sum_gamerounds by version (outlier removed):", df_cleaned.groupby("version")["sum_gamerounds"].describe(), sep="\n")

# --- Retention ---------------------------------------------------------------
# Mean of each retention column per version, expressed as a percentage and
# rounded to 3 decimals. This (and the tests below) uses the full `df`, so the
# rows excluded from `df_cleaned` are still included here.
retention_by_version = df.groupby("version")[["retention_1", "retention_7"]].mean().mul(100).round(3)

print("\nRetention rates by version (%):", retention_by_version, sep="\n")

# Chi-squared test of independence: version vs. 1-day retention.
contingency_table_1 = pd.crosstab(index=df["version"], columns=df["retention_1"])
chi2_1, p_1, dof_1, expected_1 = chi2_contingency(contingency_table_1)
print("\nContingency table for 1-day retention:", contingency_table_1, sep="\n")
print(f"Chi-Squared Statistic For 1-day retention: {chi2_1:.4f}", f"P-value: {p_1:.4f}", sep="\n")

# Chi-squared test of independence: version vs. 7-day retention.
contingency_table_7 = pd.crosstab(index=df["version"], columns=df["retention_7"])
chi2_7, p_7, dof_7, expected_7 = chi2_contingency(contingency_table_7)
print("\nContingency table for 7-day retention:", contingency_table_7, sep="\n")
print(f"Chi-Squared Statistic For 7-day retention: {chi2_7:.4f}", f"P-value: {p_7:.4f}", sep="\n")

# --- Bar chart of 7-day retention per version --------------------------------
retention_by_version["retention_7"].plot(kind="bar", title='7-Day Retention by AB-Test Version')
plt.xlabel("Version")
plt.ylabel('Retention Rate (%)')
plt.xticks(rotation=0)  # keep the version labels horizontal
plt.show()


## Summary and Conclusion

Based on the analysis:

* The 1-day retention rates for 'gate_30' and 'gate_40' are very similar (44.8% vs 44.2%), and the chi-squared test shows no statistically significant difference (p-value = 0.0755 > 0.05).
* The 7-day retention rates show a slightly larger difference (19.0% for 'gate_30' vs 18.2% for 'gate_40'), and the chi-squared test indicates a statistically significant difference (p-value < 0.05).

The statistically significantly lower 7-day retention for 'gate_40' suggests that moving the gate to level 40 negatively impacts long-term player retention.

**Recommendation:** Based on this analysis, it is recommended to **keep the gate at level 30**.