In [None]:
import pandas as pd
import pymc as pm
import numpy as np
import matplotlib.pyplot as plt
import arviz as az

def _fit_retention_model(data, retention_type, draws, chains, random_seed):
    """Sample the posterior retention rate of each gate for one metric.

    Each gate's retention probability gets an independent Uniform(0, 1)
    prior and a Bernoulli likelihood over that gate's rows. The
    deterministic ``difference`` is ``p_30 - p_40``.

    Args:
        data: DataFrame holding a ``version`` column and ``retention_type``.
        retention_type: Name of the retention column to model.
        draws: Posterior draws per chain, passed to ``pm.sample``.
        chains: Number of chains, passed to ``pm.sample``.
        random_seed: Seed passed to ``pm.sample``; ``None`` leaves it unseeded.

    Returns:
        The object returned by ``pm.sample``.
    """
    gate_30_retention = data.loc[data['version'] == 'gate_30', retention_type]
    gate_40_retention = data.loc[data['version'] == 'gate_40', retention_type]

    with pm.Model():
        p_30 = pm.Uniform(f'p_30_{retention_type}', lower=0, upper=1)
        p_40 = pm.Uniform(f'p_40_{retention_type}', lower=0, upper=1)

        pm.Bernoulli(f'obs_30_{retention_type}', p=p_30, observed=gate_30_retention)
        pm.Bernoulli(f'obs_40_{retention_type}', p=p_40, observed=gate_40_retention)

        # Positive values mean gate_30 retains better than gate_40.
        pm.Deterministic('difference', p_30 - p_40)

        trace = pm.sample(draws, chains=chains, random_seed=random_seed)

    return trace


def _report_retention(trace, retention_type):
    """Plot and print the posterior summary for one retention metric.

    Args:
        trace: Sampling result produced for ``retention_type``.
        retention_type: Name of the retention column that was modelled.

    Returns:
        Fraction of posterior draws in which ``difference`` (p_30 - p_40)
        is greater than zero.
    """
    posterior = trace.posterior

    # pm.sample drops its tuning draws from the posterior group by default,
    # so every stored draw is used here; no extra burn-in slice is applied.
    # This also keeps the plots consistent with az.summary below, which
    # summarises all draws.
    p_30_samples = posterior[f'p_30_{retention_type}'].values.ravel()
    p_40_samples = posterior[f'p_40_{retention_type}'].values.ravel()
    difference_samples = posterior['difference'].values.ravel()

    # Posterior distributions of the two gate probabilities
    plt.figure(figsize=(12.5, 4))
    plt.hist(p_30_samples, bins=40, label='Posterior p_30', density=True, alpha=0.7)
    plt.hist(p_40_samples, bins=40, label='Posterior p_40', density=True, alpha=0.7)
    plt.title(f'{retention_type.capitalize()} Retention Probabilities')
    plt.legend()
    plt.show()

    # Posterior distribution of the difference, with a marker at zero
    plt.figure(figsize=(12.5, 4))
    plt.hist(difference_samples, bins=40, label='Posterior difference', density=True)
    plt.title(f'Difference in {retention_type.capitalize()} Retention')
    plt.axvline(x=0, color='r', linestyle='--')
    plt.legend()
    plt.show()

    print(f"\n{retention_type.capitalize()} Retention Analysis:")
    print(az.summary(trace, hdi_prob=0.95))

    prob_superior = float(np.mean(difference_samples > 0))
    print(f"\nProbability that Gate_30 is Superior: {prob_superior:.2%}")

    return prob_superior


def load_and_analyze_data(file_path, *, draws=20000, chains=2, random_seed=None):
    """Load the A/B test data and run the 1-day and 7-day retention analyses.

    Prints a dataset overview and the per-version mean retention, then, for
    each of ``retention_1`` and ``retention_7``, fits a model comparing
    ``gate_30`` with ``gate_40``, shows the posterior plots and prints a
    summary.

    Args:
        file_path: Path or URL handed to ``pd.read_csv``.
        draws: Posterior draws per chain (default 20000).
        chains: Number of sampling chains (default 2).
        random_seed: Optional sampler seed for reproducible runs.

    Returns:
        dict mapping ``'retention_1'`` and ``'retention_7'`` to the posterior
        probability that gate_30's retention exceeds gate_40's.

    Raises:
        KeyError: If ``version``, ``retention_1`` or ``retention_7`` is
            missing from the loaded data.
    """
    data = pd.read_csv(file_path)

    # Fail before printing or sampling anything if the schema is wrong.
    required = ('version', 'retention_1', 'retention_7')
    missing = [column for column in required if column not in data.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Data overview
    print("Dataset Overview:")
    print("-" * 50)
    print(f"Total Samples: {len(data)}")
    print(f"Versions Distribution:\n{data['version'].value_counts()}")

    # Retention summary
    retention_summary = data.groupby('version')[['retention_1', 'retention_7']].mean()
    print("\nRetention Rates:")
    print(retention_summary)

    analyses = (
        ("1-DAY", 'retention_1'),
        ("7-DAY", 'retention_7'),
    )
    results = {}
    for heading, retention_type in analyses:
        print(f"\n{heading} RETENTION ANALYSIS")
        trace = _fit_retention_model(data, retention_type, draws, chains, random_seed)
        results[retention_type] = _report_retention(trace, retention_type)

    return results

# Script entry point: run both retention analyses on the hosted
# Cookie Cats CSV.
if __name__ == "__main__":
    file_path = (
        "https://raw.githubusercontent.com/dustywhite7/Econ8310/"
        "master/AssignmentData/cookie_cats.csv"
    )
    load_and_analyze_data(file_path=file_path)

Initializing NUTS using jitter+adapt_diag...


Dataset Overview:
--------------------------------------------------
Total Samples: 90189
Versions Distribution:
version
gate_40    45489
gate_30    44700
Name: count, dtype: int64

Retention Rates:
         retention_1  retention_7
version                          
gate_30     0.448188     0.190201
gate_40     0.442283     0.182000

1-DAY RETENTION ANALYSIS


Multiprocess sampling (2 chains in 2 jobs)
NUTS: [p_30_retention_1, p_40_retention_1]


Output()