- Chat: https://chatgpt.com/share/6736edd3-6104-800e-8648-0ce4e4240d29
- Source data: http://td.winnerstudio.vip:8996/#/tga/ide/59_32514

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, HTML
# https://chatgpt.com/share/6736edd3-6104-800e-8648-0ce4e4240d29

# Function to display DataFrame in Jupyter Notebook
def display_dataframe_to_user(name, dataframe):
    """
    Render a titled DataFrame in a Jupyter notebook.

    Two outputs are produced: first an HTML fragment holding a small CSS
    block (rules for ``table``, ``th`` and ``td``) followed by an ``<h3>``
    heading, then the DataFrame itself.

    Args:
        name (str): Heading text; inserted into the HTML as-is (not escaped).
        dataframe (pd.DataFrame): The DataFrame to display.
    """
    table_css = """
    <style>
        table {
            width: 100%;
            border-collapse: collapse;
            margin-bottom: 1em;
        }
        th, td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }
        th {
            background-color: #f2f2f2;
            font-weight: bold;
        }
    </style>
    """
    # Stylesheet and heading go out as one HTML output; the frame follows
    # as its own output so the notebook renders it with its usual repr.
    heading_markup = "{}<h3>{}</h3>".format(table_css, name)
    display(HTML(heading_markup))
    display(dataframe)

# Load your dataset
file_path = 'C:/Users/Win11/Downloads/20241115_060823_50567_v56n5.csv' #http://td.winnerstudio.vip:8996/#/tga/ide/59_32514
data = pd.read_csv(file_path)

# Data Processing
# 1. Filter for users with day_7_net_amount_sum > 0
filtered_data = data[data['day_7_net_amount_sum'] > 0]

# 2. Identify the top 6 games with the greatest number of users.
#    value_counts() sorts by descending count, so this index is already
#    in "most users first" order.
top_games = (
    filtered_data['game']
    .value_counts()
    .head(6)
    .index
)

# 3. Filter the dataset to include only these top games.
#    Take an explicit copy: a new column is added below, and assigning onto a
#    filtered slice of another frame triggers pandas' SettingWithCopyWarning.
top_games_data = filtered_data[filtered_data['game'].isin(top_games)].copy()

# 4. Add a 'Treatment' column based on nth_game (vectorised; rows whose
#    nth_game is NaN compare False and are labelled 'New user', exactly as the
#    element-wise comparison would).
top_games_data['Treatment'] = np.where(
    top_games_data['nth_game'] > 1, 'Recycled user', 'New user'
)

# 5. Calculate the average day_7_net_amount_sum for each game and treatment
grouped_data = (
    top_games_data.groupby(['game', 'Treatment'])
    .agg({'day_7_net_amount_sum': 'mean'})
    .reset_index()
)

# 6. Order games by their occurrence count for consistency.
#    The ordered categorical only defines the ordering; the rows still come out
#    of groupby in alphabetical order, so the frame has to be sorted explicitly
#    for the table (and anything iterating over it) to follow that ordering.
grouped_data['game'] = pd.Categorical(
    grouped_data['game'],
    categories=top_games,
    ordered=True
)
grouped_data = (
    grouped_data
    .sort_values(['game', 'Treatment'])
    .reset_index(drop=True)
)

# Display the grouped DataFrame
display_dataframe_to_user(name="Grouped Data for Visualization", dataframe=grouped_data)

# Visualization
# One subplot per game, comparing the mean 7-day net amount of new vs.
# recycled users, laid out on a roughly square grid.
unique_games = grouped_data['game'].unique()
num_games = len(unique_games)
# Keep at least one column so the row computation below cannot divide by zero
# when no games survive the filtering.
cols = max(1, int(np.ceil(np.sqrt(num_games))))  # Number of columns for a square-like layout
rows = int(np.ceil(num_games / cols))            # Rows based on the number of columns

# Colours are keyed by treatment rather than by bar position, so a bar keeps
# its colour even when the other treatment is absent for a game.
treatment_colors = {'New user': '#007bff', 'Recycled user': '#e56b00'}

plt.figure(figsize=(14, 12))

for i, game in enumerate(unique_games, start=1):
    subset = grouped_data[grouped_data['game'] == game]

    # Look up the mean for each treatment; a game may lack one of the two
    # groups, in which case the lookup yields NaN instead of raising.
    means_by_treatment = subset.set_index('Treatment')['day_7_net_amount_sum']
    new_user_value = means_by_treatment.get('New user', np.nan)
    recycled_user_value = means_by_treatment.get('Recycled user', np.nan)

    # Percentage difference of recycled relative to new users; only defined
    # when both groups exist and the baseline is non-zero.
    if (
        pd.notna(new_user_value)
        and pd.notna(recycled_user_value)
        and new_user_value != 0
    ):
        percentage_diff = ((recycled_user_value - new_user_value) / new_user_value) * 100
        diff_label = f'{percentage_diff:.1f}%'
        diff_color = 'green' if percentage_diff > 0 else 'red'
    else:
        diff_label = 'n/a'
        diff_color = 'gray'

    # Set up subplot
    plt.subplot(rows, cols, i)
    bars = plt.bar(
        subset['Treatment'],
        subset['day_7_net_amount_sum'],
        color=[treatment_colors[label] for label in subset['Treatment']],
        width=0.6,
    )

    # Add value labels just above each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2f}', ha='center', va='bottom', fontsize=9)

    # Title with percentage difference (green = recycled higher, red = not)
    plt.title(
        f'{game}\n% Difference: {diff_label}',
        fontsize=12, weight='bold', pad=10, color=diff_color
    )

    # Axes labels
    plt.xlabel('User Type', fontsize=10)
    plt.ylabel('Avg Day 7 Net Amount', fontsize=10)

    # Format ticks
    plt.xticks(fontsize=9)
    plt.yticks(fontsize=9)
    plt.grid(False)

# Adjust layout for readability
plt.tight_layout()
plt.show()
