In [None]:
# Necessary Libraries
import os
import warnings
from datetime import datetime
from datetime import timezone
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil import parser
from googleapiclient.discovery import build
from lifelines import KaplanMeierFitter
from scipy.stats import gamma

In [None]:
# Load the YouTube channel dataset from CSV into `df`
file_path = r"C:\Users\...\youtube_data_4K_channels.csv"  # point this at your local copy
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Load the Monte Carlo simulation output from CSV into `sim_df`
mc_file_path = r"C:\Users\...\mc_simulation_data.csv"  # point this at your local copy
sim_df = pd.read_csv(filepath_or_buffer=mc_file_path)

In [None]:
# Constructing the km estimator function
def kaplan_meier_analysis(df, milestone=250_000, cohort_col=None):
    """
    Perform Kaplan–Meier survival analysis on channels reaching a milestone.

    A channel's "event" is having ``current_subscriber_count >= milestone``;
    its duration is ``channel_age_days``. The fitted curve is therefore the
    probability of NOT having reached the milestone by a given channel age.
    One figure is drawn and shown as a side effect.

    NOTE(review): the duration is the channel's current age even for channels
    that already reached the milestone, not the age at which they reached it —
    confirm this is the intended time-to-event definition.

    Parameters:
    - df: DataFrame with columns 'channel_id', 'current_subscriber_count' and
      'channel_age_days' (plus `cohort_col` when stratifying). Not modified;
      the function works on a copy.
    - milestone: subscriber threshold that defines the event.
    - cohort_col: optional column name to stratify by. If it is given but is
      not a column of `df`, a warning is emitted and a single overall curve
      is fitted instead. Rows whose cohort value is missing are left out by
      pandas' default groupby behaviour.

    Returns:
    - kmf_dict: dict of KM fitters, keyed by cohort value, or {"overall": kmf}
    - km_results: DataFrame with ['channel_id', 'km_survival_prob'] (plus the
      cohort column when stratified); each probability is the fitted survival
      function evaluated at that channel's own duration.
    """
    df = df.copy()

    # Event flag: has channel reached milestone?
    df["event"] = df["current_subscriber_count"] >= milestone
    df["duration"] = df["channel_age_days"]

    stratify = bool(cohort_col) and cohort_col in df.columns
    if cohort_col and not stratify:
        # Previously this fell through silently, which hides typos in the
        # column name; keep the fallback but make it visible.
        warnings.warn(
            f"cohort_col={cohort_col!r} is not a column of df; "
            "fitting a single overall curve instead.",
            stacklevel=2,
        )

    # Explicit axes so every curve lands on this figure, not on whatever
    # pyplot currently considers the active axes.
    _, ax = plt.subplots(figsize=(12, 6))

    kmf_dict = {}
    km_results_list = []

    if stratify:
        # --- STRATIFIED KM ---
        for cohort, group in df.groupby(cohort_col):
            kmf, scores = _fit_km_group(group, label=str(cohort), ax=ax)
            kmf_dict[cohort] = kmf
            scores[cohort_col] = cohort
            km_results_list.append(scores)
        ax.set_title(f"Kaplan–Meier Survival Curves by {cohort_col}")
    else:
        # --- OVERALL KM ---
        kmf, scores = _fit_km_group(df, label=f"Milestone {milestone}", ax=ax)
        kmf_dict["overall"] = kmf
        km_results_list.append(scores)
        ax.set_title(f"Kaplan–Meier Survival Curve (Milestone {milestone})")

    ax.set_xlabel("Channel Age (days)")
    ax.set_ylabel("Probability of NOT reaching milestone")
    ax.grid(True)
    plt.show()

    # Combine results; pd.concat raises on an empty list, which happens when
    # stratification produces no groups (e.g. empty input frame).
    if km_results_list:
        km_results = pd.concat(km_results_list, ignore_index=True)
    else:
        km_results = pd.DataFrame(
            columns=["channel_id", "km_survival_prob", cohort_col]
        )

    return kmf_dict, km_results


def _fit_km_group(group, label, ax):
    """Fit one KM curve on `group`, draw it on `ax`, and score each row.

    `group` must carry the 'duration', 'event' and 'channel_id' columns.
    Returns (fitted KaplanMeierFitter, DataFrame with
    ['channel_id', 'km_survival_prob']).
    """
    kmf = KaplanMeierFitter()
    kmf.fit(
        durations=group["duration"],
        event_observed=group["event"],
        label=label,
    )
    kmf.plot_survival_function(ci_show=True, ax=ax)

    # Evaluate survival probability at each channel's own age
    surv_probs = kmf.survival_function_at_times(group["duration"]).values
    scores = pd.DataFrame({
        "channel_id": group["channel_id"].values,
        "km_survival_prob": surv_probs,
    })
    return kmf, scores

In [None]:
# Function for adding a year column
def add_start_year(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame equal to `df` plus a `start_year` column.

    `start_year` is the calendar year of `df["start_date"]`, which is parsed
    as ISO 8601. The input frame is left unchanged.
    """
    parsed_start = pd.to_datetime(df["start_date"], format="ISO8601")
    years = parsed_start.dt.year
    return df.assign(start_year=years)

In [None]:
# Attach the derived start_year column, then print the frame summary
df = df.pipe(add_start_year)
df.info()

In [None]:
# Overall survival curve (all channels pooled).
# The call returns a (kmf_dict, km_results) tuple, stored here as-is.
kmf_overall = kaplan_meier_analysis(df=df)

In [None]:
# One survival curve per start_year cohort.
# The call returns a (kmf_dict, km_results) tuple, stored here as-is.
kmf_by_year = kaplan_meier_analysis(df=df, cohort_col="start_year")

In [None]:
# Saving the per-channel KM results to a CSV file
output_dir = Path('C:/Users/.../YT Analysis Data') #choose your own file path
output_filename = 'kaplan_meier_estimate_data.csv'
output_filepath = output_dir/output_filename
print(output_filepath)

# `km_results` was never bound at notebook scope, so this cell raised a
# NameError on a fresh kernel. kaplan_meier_analysis returns
# (kmf_dict, km_results), and that tuple is what `kmf_overall` holds, so the
# per-channel table is its second element.
# NOTE(review): switch to kmf_by_year[1] if the stratified table is the one
# that should be exported.
km_results = kmf_overall[1]

# Creating the directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Saving the DataFrame to CSV in the new location
km_results.to_csv(output_filepath, index=False)
print("Channels Saved Succesfully!")