#### Importing Necessary Libraries

In [None]:
# Necessary Libraries
import os
import pandas as pd
import numpy as np
from googleapiclient.discovery import build
from datetime import datetime
from datetime import timezone
from dateutil import parser
from pathlib import Path

In [None]:
# Importing the api_key from the text file.
# The location can be overridden with the YOUTUBE_API_KEY_FILE environment
# variable; otherwise the original hard-coded path is used.
file_dir = os.environ.get(
    "YOUTUBE_API_KEY_FILE",
    r"C:\Users\External Boot\Downloads\Documents\supporting_files\api_key.txt",
)

# "utf-8-sig" reads plain UTF-8/ASCII and also drops a leading byte-order mark,
# which would otherwise end up inside the key.
with open(file_dir, "r", encoding="utf-8-sig") as f:
    API_KEY = f.read().strip()

# Fail here rather than on the first API request if the file is empty.
if not API_KEY:
    raise ValueError(f"No API key found in {file_dir}")

print("API Key Loaded:", API_KEY[:8] + "******")

In [None]:
# Setting up the api service name and api version in the builder.
# `youtube` is a module-level client: the fetch functions defined in later
# cells read it as a global instead of taking it as a parameter.
youtube = build('youtube','v3', developerKey=API_KEY)

#### Getting The YouTube Channels

In [None]:
# Creating a query list to search for the relevant channels
queries = [
    "data science",
    "AI",
    "artificial intelligence",
    "machine learning",
    "deep learning",
    "generative AI",
    "LLM",
    "neural networks",
    "MLOps",
    "data engineering",
    
    # additional fields
    "computer vision",
    "natural language processing",
    "reinforcement learning",
    "big data",
    "cloud computing",
    "edge AI",
    "AI ethics",
    "data visualization",
    "predictive analytics",
    "data mining",
    "robotics",
    "automation",
    "cybersecurity AI",
    "quantum computing",
    "AI in healthcare",
    "AI in finance",
    "AI in education",
    "AI governance",
    "AutoML",
    "feature engineering",
    "data pipelines",
    "AI research",
    "AI startups",
    "AI tools and frameworks"
]

# Normalised search phrases: surrounding whitespace removed, lower-cased
queries_updt = [phrase.strip().lower() for phrase in queries]

In [None]:
# User defined function to get the list of channels and the respective ids
def get_youtube_channels(queries, max_per_page=50, pages=50):
    """Search YouTube for channels matching each query and collect them.

    Uses the module-level ``youtube`` client.

    Args:
        queries: Iterable of search phrases (e.g. "data science", "ai").
        max_per_page: Passed as ``maxResults`` on every search request.
        pages: Upper bound on the number of result pages requested per query.

    Returns:
        dict mapping channel id -> stripped channel title, ordered
        case-insensitively by title. A channel returned by several searches
        appears once, with the title from the latest response that held it.
    """
    found = {}

    for query in queries:
        print(f"Searching for: {query}")

        page_token = None  # each query starts again from its first page
        for _page_number in range(pages):
            request = youtube.search().list(
                q=query,
                type="channel",
                part="snippet",
                order="relevance",
                maxResults=max_per_page,
                pageToken=page_token,
            )
            payload = request.execute()

            found.update(
                (entry["snippet"]["channelId"], entry["snippet"]["title"].strip())
                for entry in payload.get("items", [])
            )

            # No token means the API has no further pages for this query
            page_token = payload.get("nextPageToken")
            if not page_token:
                break

    # Alphabetical by title, ignoring case
    by_title = sorted(found.items(), key=lambda pair: pair[1].lower())
    return dict(by_title)

In [None]:
# Quick check on the areas of interest
for i in queries_updt:
    print(i)

In [None]:
# Creating a dataframe with channel id and title
req_channels = get_youtube_channels(queries_updt, pages=50)

# Columns are named explicitly so the frame keeps its schema even when the
# search returns nothing; a column-less frame would make the later
# channels_df['channel_id'] lookup raise KeyError.
channels_df = pd.DataFrame(
    [(title, cid) for cid, title in req_channels.items()],
    columns=["channel_title", "channel_id"],
)

In [None]:
# Quick check on the channels
channels_df.info()

In [None]:
# Defining the new directory and filename
output_dir = Path('C:/Users/...') #select your own path
output_filename = 'channels.csv'
output_filepath = output_dir/output_filename
print(output_filepath)

# Creating the directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

# Saving the DataFrame to CSV in the new location
channels_df.to_csv(output_filepath, index=False)
print("Channels Saved Succesfully!")

#### Fetching Relevant Data -- Looping Channel Ids

In [None]:
# UDF to fetch the channel info from youtube
def get_channel_info(channel_id):
    """Fetch one channel's profile/statistics and derive growth metrics.

    Uses the module-level ``youtube`` client and calls ``get_avg_engagement``
    for the like/comment averages.

    Args:
        channel_id: YouTube channel id to look up.

    Returns:
        None when the API returns no item for the id, otherwise a dict with
        the raw counts (subscribers, videos), channel age, per-video averages,
        upload cadence, the 250k milestone fields and two simulation inputs
        (``growth_rate_per_day`` and ``engagement_rate``).
    """
    response = youtube.channels().list(
        part="snippet,statistics,contentDetails",
        id=channel_id,
    ).execute()

    # .get() so a response without an "items" key is treated like an empty one
    items = response.get("items")
    if not items:
        return None

    item = items[0]
    snippet = item["snippet"]
    stats = item["statistics"]

    # Dates
    start_date = snippet["publishedAt"]
    start_date_dt = parser.isoparse(start_date)
    now = datetime.now(timezone.utc)
    age_days = (now - start_date_dt).days
    years_active = age_days // 365

    # Stats (the API returns counts as strings; missing counters count as 0)
    subs = int(stats.get("subscriberCount", 0))
    total_videos = int(stats.get("videoCount", 0))
    total_views = int(stats.get("viewCount", 0))

    # Derived metrics.
    # The original guard `total_videos > 0 & age_days>0` parsed as the chained
    # comparison `total_videos > (0 & age_days) > 0`, which is always False,
    # so uploads_per_week was always 0. A boolean `and` is what was intended.
    if total_videos > 0 and age_days > 0:
        uploads_per_week = round(total_videos / (age_days / 7), 2)
    else:
        uploads_per_week = 0
    avg_views = round(total_views / total_videos, 2) if total_videos > 0 else 0
    avg_likes, avg_comments = get_avg_engagement(channel_id)

    # Average subscribers gained per day over the channel's whole lifetime
    growth_rate_per_day = subs / age_days if age_days > 0 else 0

    # Milestone: for channels already at 250k, estimate how long it took by
    # assuming the lifetime-average growth rate applied from day one.
    reached_250k = subs >= 250000
    time_to_250k_days = None
    if reached_250k and growth_rate_per_day > 0:
        time_to_250k_days = round(250000 / growth_rate_per_day, 2)

    return {
        "channel_id": channel_id,
        "channel_title": snippet.get("title", ""),
        "niche": snippet.get("title", ""),  # placeholder for classification
        "start_date": start_date,
        "current_subscriber_count": subs,
        "years_active": years_active,
        "channel_age_days": age_days,
        "total_videos": total_videos,
        "avg_views": avg_views,
        "avg_likes": avg_likes,
        "avg_comments": avg_comments,
        "uploads_per_week": uploads_per_week,
        "reached_250k": reached_250k,
        "time_to_250k_days": time_to_250k_days,

        # Simulation inputs
        "growth_rate_per_day": growth_rate_per_day,
        "engagement_rate": (avg_likes + avg_comments) / avg_views if avg_views > 0 else 0
    }

In [None]:
# UDF to generate additional fields
def get_avg_engagement(channel_id, sample_size=50):
    """Average likes and comments per video over a sample of a channel's uploads.

    Uses the module-level ``youtube`` client: looks up the channel's uploads
    playlist, reads up to ``sample_size`` video ids from it (in the order the
    API returns them), then averages the like/comment counts of those videos.

    Args:
        channel_id: YouTube channel id.
        sample_size: Maximum number of videos to sample.

    Returns:
        ``(avg_likes, avg_comments)``, each rounded to 2 decimals. ``(0, 0)``
        when the channel or its uploads playlist is missing, when listing the
        playlist fails, or when no video statistics come back.
    """
    response = youtube.channels().list(
        part="contentDetails",
        id=channel_id,
    ).execute()
    items = response.get("items", [])
    if not items:
        return 0, 0

    uploads_playlist_id = items[0]["contentDetails"]["relatedPlaylists"].get("uploads")
    if not uploads_playlist_id:
        return 0, 0

    video_ids = []
    next_page_token = None
    try:
        while len(video_ids) < sample_size:
            # Ask only for what is still missing (at most 50 per request, as
            # before) so sample_size is honoured when it is not a multiple
            # of 50.
            remaining = sample_size - len(video_ids)
            res = youtube.playlistItems().list(
                part="contentDetails",
                playlistId=uploads_playlist_id,
                maxResults=min(50, remaining),
                pageToken=next_page_token,
            ).execute()
            video_ids.extend(
                item["contentDetails"]["videoId"] for item in res.get("items", [])
            )
            next_page_token = res.get("nextPageToken")
            if not next_page_token:
                break
    except Exception as e:
        # Deliberate best effort: a playlist that cannot be listed yields
        # zero engagement instead of aborting the whole channel.
        print(f"Error fetching playlist for {channel_id}: {e}")
        return 0, 0

    # Guard against a page that returned more items than requested
    video_ids = video_ids[:sample_size]

    likes, comments, count = 0, 0, 0
    # Statistics are requested in batches of 50 ids, as in the original code
    for i in range(0, len(video_ids), 50):
        res = youtube.videos().list(
            part="statistics",
            id=",".join(video_ids[i:i + 50]),
        ).execute()
        for item in res.get("items", []):
            stats = item["statistics"]
            # Counters missing from the response count as 0
            likes += int(stats.get("likeCount", 0))
            comments += int(stats.get("commentCount", 0))
            count += 1

    avg_likes = round(likes / count, 2) if count > 0 else 0
    avg_comments = round(comments / count, 2) if count > 0 else 0
    return avg_likes, avg_comments

In [None]:
# UDF to build the final dataframe
def build_channel_dataframe(channel_ids, save_path):
    """Fetch info for every channel id and return the rows as a DataFrame.

    After each successful fetch the rows gathered so far are written to
    ``save_path`` as CSV, so an interrupted run keeps its progress. Any
    exception raised while handling a channel is printed and that channel is
    skipped.

    Args:
        channel_ids: Sized sequence of YouTube channel ids.
        save_path: CSV path that receives the progressive snapshot.

    Returns:
        DataFrame with one row per channel for which info was returned.
    """
    total = len(channel_ids)
    rows = []

    for position, channel_id in enumerate(channel_ids, start=1):
        print(f"[{position}/{total}] Fetching data for channel: {channel_id}")
        try:
            record = get_channel_info(channel_id)
            if not record:
                continue
            rows.append(record)
            # Snapshot everything collected so far
            pd.DataFrame(rows).to_csv(save_path, index=False)
        except Exception as e:
            # Report and move on to the next channel
            print(f"Error fetching {channel_id}: {e}")

    return pd.DataFrame(rows)

In [None]:
# Example usage
channel_ids = list(channels_df['channel_id'])
print("Data fetching for {}  channels".format(len(channel_ids)))

In [None]:
# Defining the new directory and filename
output_dir = Path('C:/Users/...') #select your own path
output_filename = 'channels_data_progressive.csv'
output_filepath = output_dir/output_filename
print(output_filepath)

# Creating the directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Final dataframe creation
df = build_channel_dataframe(channel_ids, output_filepath)
print(df.head())

In [None]:
# Saving the DataFrame to CSV in the new location
df.to_csv(output_filepath, index=False)

In [None]:
# Quick check on the dataframe info
df.info()

In [None]:
from dateutil import parser

def add_start_year(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``start_year`` column.

    The year is taken from ``start_date``, parsed as ISO 8601 timestamps.
    The input frame is left unchanged.
    """
    parsed_dates = pd.to_datetime(df["start_date"], format="ISO8601")
    return df.assign(**{"start_year": parsed_dates.dt.year})

def cohort_summary(
    df: pd.DataFrame,
    sim_results: pd.DataFrame,
    cohort_by: str = "niche"
) -> pd.DataFrame:
    """
    Summarize metrics by cohort (e.g., niche, start_year, uploads_per_week bucket).

    Parameters:
    - df: per-channel frame with 'channel_id', 'current_subscriber_count',
      'growth_rate_per_day', 'engagement_rate' and the cohort column
      ('uploads_per_week' when cohort_by == "uploads_bucket").
    - sim_results: per-channel frame with 'channel_id',
      'prob_250k_in_horizon' and 'expected_end_subs'; left-joined onto df.
    - cohort_by: column to group on, or "uploads_bucket" to group on binned
      uploads_per_week.

    Returns:
    - One row per cohort with medians of the metrics, the mean probability
      and the number of distinct channels, sorted by median probability
      (highest first).
    """
    merged = df.merge(sim_results, on="channel_id", how="left")

    # Optional: bucket uploads_per_week for cohorting
    if cohort_by == "uploads_bucket":
        bins = [0, 0.5, 1, 2, 4, 10, float("inf")]
        labels = ["<=0.5", "0.5-1", "1-2", "2-4", "4-10", ">10"]
        # include_lowest=True closes the first interval on the left, so a
        # channel with exactly 0 uploads/week lands in "<=0.5" instead of
        # becoming NaN and silently dropping out of the summary.
        merged["uploads_bucket"] = pd.cut(
            merged["uploads_per_week"],
            bins=bins,
            labels=labels,
            right=True,
            include_lowest=True,
        )

    summary = (
        # observed=False keeps empty buckets as rows (the historical default
        # for categorical keys) and makes the choice explicit.
        merged.groupby(cohort_by, observed=False)
        .agg(
            current_subs_median=("current_subscriber_count", "median"),
            growth_rate_day_median=("growth_rate_per_day", "median"),
            engagement_rate_median=("engagement_rate", "median"),
            prob_250k_median=("prob_250k_in_horizon", "median"),
            prob_250k_mean=("prob_250k_in_horizon", "mean"),
            expected_end_subs_median=("expected_end_subs", "median"),
            channels=("channel_id", "nunique")
        )
        .sort_values("prob_250k_median", ascending=False)
        .reset_index()
    )
    return summary

In [None]:
# # Prepare
# df = add_start_year(df)

# # Run Monte Carlo
# sim_df = monte_carlo_probability(df, horizon_days=365*3, sims=5000, variability_ratio=0.25, use_lognormal=False)

# # Cohort by niche
# niche_cohort = cohort_summary(df, sim_df, cohort_by="niche")
# print(niche_cohort.head())

# # Cohort by start_year
# year_cohort = cohort_summary(df, sim_df, cohort_by="start_year")
# print(year_cohort.head())

# # Cohort by uploads intensity bucket
# uploads_cohort = cohort_summary(df, sim_df, cohort_by="uploads_bucket")
# print(uploads_cohort.head())

In [None]:
# Prepare
# NOTE(review): df_tmp is not assigned in any earlier cell, so a clean
# top-to-bottom run failed here with NameError. Building it from df is an
# assumption about the intended source frame - confirm.
df_tmp = add_start_year(df)

In [None]:
df_tmp['start_year'].value_counts()

In [None]:
df_tmp['start_date'].head()

In [None]:
now = datetime.now(timezone.utc)
pd.to_datetime(now,format="ISO8601").date

In [None]:
# Channel age in whole days. The original line wrote to an undefined name
# (dt_tmp), used `.date` without calling it, and called `.days` on a Series;
# subtracting the parsed timestamps and reading `.dt.days` is the working form.
df_tmp['age_days'] = (
    pd.to_datetime(now) - pd.to_datetime(df_tmp["start_date"], format="ISO8601")
).dt.days

In [None]:
df_tmp["age_days"] = (pd.to_datetime(now) - pd.to_datetime(df_tmp["start_date"], format="ISO8601")).dt.days

In [None]:
df_tmp.head()

In [None]:
# df_tmp['uplds_per_week'] = round(df_tmp['total_videos'] / (df_tmp['age_days'] / 7), 2) if (df_tmp['total_videos'] > 0 & df_tmp['age_days']>0) else 0

In [None]:
df_tmp["uplds_per_week"] = df_tmp.apply(
    lambda row: round(row["total_videos"] * 7 / row["age_days"], 2)
    if (row["total_videos"] > 0 and row["age_days"] > 0)
    else 0,
    axis=1
)

In [None]:
df_tmp['uplds_per_week'].describe()

In [None]:
df_tmp['uploads_per_week'] = df_tmp['uplds_per_week']

In [None]:
df_tmp['uploads_per_week'].describe()

In [None]:
# Cohort by niche
niche_cohort = cohort_summary(df, sim_df, cohort_by="niche")
print(niche_cohort.head())

In [None]:
# Cohort by start_year
year_cohort = cohort_summary(df_tmp, sim_df, cohort_by="start_year")
print(year_cohort.head())

In [None]:
year_cohort.head()

In [None]:
year_cohort

In [None]:
# Cohort by uploads intensity bucket
uploads_cohort = cohort_summary(df_tmp, sim_df, cohort_by="uploads_bucket")
print(uploads_cohort.head())

In [None]:
uploads_cohort

In [None]:
df_tmp.head()

In [None]:
from scipy.stats import gamma

def bayesian_lambda_posterior(
    df: pd.DataFrame,
    prior_strength_days: int = 30,
    variability_ratio: float = 0.25
) -> pd.DataFrame:
    """
    Gamma-Poisson update of the daily growth rate (lambda) for each channel.

    Prior:      lambda ~ Gamma(a0, b0) with b0 = prior_strength_days / mean
                and a0 = mean * b0, so the prior mean equals the channel's
                own growth_rate_per_day (floored at 1e-6 to avoid dividing
                by zero).
    Evidence:   channel_age_days days with mean * channel_age_days gains.
    Posterior:  lambda ~ Gamma(a0 + gains, b0 + days).

    - prior_strength_days: pseudo-count of days behind the prior.
    - variability_ratio: accepted for interface compatibility; it is not
      used in the computation below.

    Returns one row per input row: channel_id, prior (a, b), posterior
    (a, b), posterior mean and posterior standard deviation.
    """
    daily_rate = df["growth_rate_per_day"].to_numpy(dtype=float)
    observed_days = df["channel_age_days"].to_numpy(dtype=float)

    # Prior: rate parameter scaled by the pseudo-day count, shape chosen so
    # that a / b reproduces the (floored) observed mean.
    rate_floor = np.clip(daily_rate, 1e-6, None)
    prior_b = prior_strength_days / rate_floor
    prior_a = rate_floor * prior_b

    # Conjugate update with the aggregated "observations"
    posterior_a = prior_a + daily_rate * observed_days
    posterior_b = prior_b + observed_days

    # Gamma(a, b): mean = a / b, variance = a / b^2
    columns = {
        "channel_id": df["channel_id"],
        "lambda_prior_a": prior_a,
        "lambda_prior_b": prior_b,
        "lambda_post_a": posterior_a,
        "lambda_post_b": posterior_b,
        "lambda_post_mean": posterior_a / posterior_b,
        "lambda_post_std": np.sqrt(posterior_a / (posterior_b ** 2)),
    }
    return pd.DataFrame(columns)

def bayesian_prob_250k(
    df: pd.DataFrame,
    posterior_df: pd.DataFrame,
    horizon_days: int = 365 * 3,
    sims: int = 5000,
    random_state: int | None = 123
) -> pd.DataFrame:
    """
    Draw λ from posterior Gamma for each channel and simulate horizon gains.
    Compute probability of reaching 250k and credible intervals.
    """
    rng = np.random.default_rng(random_state)
    merged = df.merge(posterior_df, on="channel_id", how="left")

    current = merged["current_subscriber_count"].to_numpy(dtype=float)
    a = merged["lambda_post_a"].to_numpy(dtype=float)
    b = merged["lambda_post_b"].to_numpy(dtype=float)

    # Sample lambda per sim: shape (n_channels, sims)
    lam_samples = rng.gamma(shape=a[:, None], scale=1.0 / b[:, None], size=(len(df), sims))
    # Predictive expected gains ~ Poisson(λ * horizon). For computation efficiency, use Normal approx for large means or sample Poisson directly:
    gains = rng.poisson(lam_samples * horizon_days)
    ending = current[:, None] + gains
    hit = ending >= 250_000

    prob = hit.mean(axis=1)
    q10 = np.quantile(ending, 0.10, axis=1)
    q50 = np.quantile(ending, 0.50, axis=1)
    q90 = np.quantile(ending, 0.90, axis=1)

    out = pd.DataFrame({
        "channel_id": merged["channel_id"],
        "bayes_prob_250k_in_horizon": prob,
        "bayes_end_subs_q10": q10,
        "bayes_end_subs_q50": q50,
        "bayes_end_subs_q90": q90,
        "horizon_days": horizon_days,
        "sims": sims
    })
    return out


In [None]:
# Posterior for lambda per channel
posterior_df = bayesian_lambda_posterior(df, prior_strength_days=30, variability_ratio=0.25)

# Bayesian probabilities of hitting 250k
bayes_res = bayesian_prob_250k(df, posterior_df, horizon_days=365*3, sims=5000)

# Merge everything
sim_df = monte_carlo_probability(df, horizon_days=365*3, sims=5000)
# Both result frames carry 'horizon_days' and 'sims' (identical here: same
# horizon and simulation count). Dropping them from one side keeps a single
# copy instead of horizon_days_x / horizon_days_y, sims_x / sims_y.
full_res = (
    df[["channel_id", "niche", "current_subscriber_count"]]
    .merge(sim_df, on="channel_id")
    .merge(bayes_res.drop(columns=["horizon_days", "sims"]), on="channel_id")
)
print(full_res.head())

In [None]:
import numpy as np
import pandas as pd

def monte_carlo_probability(
    df: pd.DataFrame,
    horizon_days: int = 365 * 3,
    sims: int = 5000,
    variability_ratio: float = 0.25,
    use_lognormal: bool = False,
    random_state: int | None = 42
) -> pd.DataFrame:
    """
    Vectorized Monte Carlo simulation of subscriber growth.
    Returns a DataFrame with per-channel probabilities and summary stats.
    
    Parameters:
    - variability_ratio: scales std around mean daily growth (e.g., 0.25 -> 25% of mean).
    - use_lognormal: if True, model daily gains as Lognormal; else Gaussian with floor at 0.
    """
    rng = np.random.default_rng(random_state)

    channels = df["channel_id"].to_numpy()
    current = df["current_subscriber_count"].to_numpy(dtype=float)
    mean_growth = df["growth_rate_per_day"].to_numpy(dtype=float)

    # Scale variability by engagement and uploads for richer dynamics
    engagement = np.clip(df["engagement_rate"].to_numpy(dtype=float), 0, None)
    uploads = np.clip(df["uploads_per_week"].to_numpy(dtype=float), 0.01, None)
    # Normalized modifiers
    eng_mod = 1.0 + 0.5 * (engagement / (engagement.mean() + 1e-9))
    upl_mod = 1.0 + 0.3 * (uploads / (uploads.mean() + 1e-9))

    std_growth = np.clip(mean_growth * variability_ratio * eng_mod * upl_mod, 0.0, None)

    n = len(channels)
    # Shape: (n_channels, sims, horizon_days)
    if use_lognormal:
        # Lognormal params from mean and std (approximation)
        mu = np.log(np.clip(mean_growth, 1e-6, None)) - 0.5 * np.log1p((std_growth / np.clip(mean_growth, 1e-6, None)) ** 2)
        sigma = np.sqrt(np.log1p((std_growth / np.clip(mean_growth, 1e-6, None)) ** 2))
        daily = rng.lognormal(mean=mu[:, None, None], sigma=sigma[:, None, None], size=(n, sims, horizon_days))
    else:
        daily = rng.normal(loc=mean_growth[:, None, None], scale=std_growth[:, None, None], size=(n, sims, horizon_days))
        daily = np.clip(daily, 0.0, None)

    cumulative_gains = daily.sum(axis=2)  # (n, sims)
    ending_subs = current[:, None] + cumulative_gains
    hit_250k = ending_subs >= 250_000

    prob = hit_250k.mean(axis=1)
    # Expected days to hit 250k (median over successful paths); fallback to NaN if never hit
    # Compute first passage time for each sim:
    threshold = 250_000 - current[:, None]
    cum_daily = daily.cumsum(axis=2)  # (n, sims, t)
    reached = cum_daily >= threshold[:, :, None]  # bool
    # For each channel, sim: first time index reaching threshold
    first_hit_idx = reached.argmax(axis=2)  # returns 0 if never reached; need mask
    ever_hit = reached.any(axis=2)
    # Days to hit (1-based indexing for day count)
    days_to_hit = np.where(ever_hit, first_hit_idx + 1, np.nan)
    # Median across sims per channel
    median_days_to_hit = np.nanmedian(days_to_hit, axis=1)

    # Expected ending subs and quantiles
    expected_end = ending_subs.mean(axis=1)
    q10 = np.quantile(ending_subs, 0.10, axis=1)
    q50 = np.quantile(ending_subs, 0.50, axis=1)
    q90 = np.quantile(ending_subs, 0.90, axis=1)

    out = pd.DataFrame({
        "channel_id": channels,
        "prob_250k_in_horizon": prob,
        "median_days_to_250k": median_days_to_hit,
        "expected_end_subs": expected_end,
        "end_subs_q10": q10,
        "end_subs_q50": q50,
        "end_subs_q90": q90,
        "horizon_days": horizon_days,
        "sims": sims
    })
    return out

In [None]:
# # Run Monte Carlo
sim_df = monte_carlo_probability(df_tmp, horizon_days=365*3, sims=500, variability_ratio=0.25, use_lognormal=False)

In [None]:
sim_df.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

def kaplan_meier_analysis(df, milestone=250_000, cohort_col=None):
    """
    Perform Kaplan–Meier survival analysis on channels reaching a milestone.

    The "event" is a channel having at least `milestone` subscribers now and
    the duration is its age in days. The caller's DataFrame is not modified.

    Parameters:
    - df: DataFrame with at least ['channel_age_days', 'current_subscriber_count']
    - milestone: subscriber milestone (default 250k)
    - cohort_col: optional column name to stratify curves (e.g., 'niche', 'start_year')

    Returns:
    - dict of fitted KaplanMeierFitter objects: keyed by cohort value when
      stratified, otherwise {"overall": kmf}. The plot is shown as a side effect.
    """
    # Work on a copy: adding the helper columns to `df` itself would leak
    # 'event'/'duration' into the caller's frame.
    data = df.assign(
        event=df["current_subscriber_count"] >= milestone,
        duration=df["channel_age_days"],
    )

    # One explicit Axes, handed to every plot call, so all curves land on the
    # same chart that receives the title and axis labels below.
    fig, ax = plt.subplots(figsize=(10, 6))

    if cohort_col and cohort_col in data.columns:
        kmf_dict = {}
        for cohort, group in data.groupby(cohort_col):
            kmf = KaplanMeierFitter()
            kmf.fit(durations=group["duration"], event_observed=group["event"], label=str(cohort))
            kmf.plot_survival_function(ax=ax, ci_show=True)
            kmf_dict[cohort] = kmf
        ax.set_title(f"Kaplan–Meier Survival Curves by {cohort_col}")
    else:
        kmf = KaplanMeierFitter()
        kmf.fit(durations=data["duration"], event_observed=data["event"], label=f"Milestone {milestone}")
        kmf.plot_survival_function(ax=ax, ci_show=True)
        ax.set_title(f"Kaplan–Meier Survival Curve (Milestone {milestone})")
        kmf_dict = {"overall": kmf}

    ax.set_xlabel("Channel Age (days)")
    ax.set_ylabel("Probability of NOT reaching milestone")
    ax.grid(True)
    plt.show()

    return kmf_dict

In [None]:
# Suppose df is your channel DataFrame with channel_age_days and current_subscriber_count
# Add a start_year column for cohort analysis.
# format="ISO8601" matches how add_start_year parses the same column and
# accepts timestamps with and without fractional seconds in one column;
# without it pandas may settle on the first value's format and reject the rest.
df["start_year"] = pd.to_datetime(df["start_date"], format="ISO8601").dt.year

# Overall survival curve
kmf_overall = kaplan_meier_analysis(df)

# Stratified by niche
kmf_by_niche = kaplan_meier_analysis(df, cohort_col="niche")

# Stratified by start year
kmf_by_year = kaplan_meier_analysis(df, cohort_col="start_year")

In [None]:
def combine_all_results(
    df_base: pd.DataFrame,
    mc_df: pd.DataFrame,
    bayes_df: pd.DataFrame,
    cohort_df: pd.DataFrame,
    km_df: pd.DataFrame,
    w_mc: float = 0.35,
    w_bayes: float = 0.35,
    w_km: float = 0.20,
    w_expected: float = 0.10,
    w_cohort: float = 0.10
) -> pd.DataFrame:
    """
    Combine Monte Carlo, Bayesian, Cohort, and Kaplan–Meier results
    into a single final probability score.

    All frames are left-joined onto df_base by 'channel_id'.

    Parameters:
    - df_base: needs 'channel_id' and 'current_subscriber_count'.
    - mc_df: needs 'prob_250k_in_horizon' and 'expected_end_subs'.
    - bayes_df: needs 'bayes_prob_250k_in_horizon'.
    - cohort_df: may carry 'cohort_score'; without it the cohort factor is 1.
    - km_df: needs 'km_survival_prob' (probability of NOT having reached 250k).
    - w_mc, w_bayes, w_km, w_expected, w_cohort: exponents of the five
      factors in the weighted product. w_cohort was previously fixed at 0.10.

    Returns:
    - The merged frame plus 'expected_norm', 'cohort_norm', 'km_prob_250k'
      and 'final_probability_score' (clipped to [0, 1]).

    NOTE: 'expected_norm' and 'cohort_norm' are min–max scaled, so the channel
    with the smallest value gets 0 and, being part of a product, a final score
    of 0 unless the corresponding weight is 0.
    """

    # Merge everything
    merged = (
        df_base[["channel_id", "current_subscriber_count"]]
        .merge(mc_df, on="channel_id", how="left")
        .merge(bayes_df, on="channel_id", how="left")
        .merge(km_df, on="channel_id", how="left")
        .merge(cohort_df, on="channel_id", how="left")
    )

    # Normalize expected_end_subs to 0–1 (epsilon avoids 0/0 when all equal)
    exp_min = merged["expected_end_subs"].min()
    exp_max = merged["expected_end_subs"].max()
    merged["expected_norm"] = (
        (merged["expected_end_subs"] - exp_min) / (exp_max - exp_min + 1e-9)
    )

    # Cohort factor (normalized); neutral when no cohort score is supplied
    if "cohort_score" in merged.columns:
        cmin = merged["cohort_score"].min()
        cmax = merged["cohort_score"].max()
        merged["cohort_norm"] = (
            (merged["cohort_score"] - cmin) / (cmax - cmin + 1e-9)
        )
    else:
        merged["cohort_norm"] = 1.0

    # Kaplan–Meier probability of reaching 250K
    merged["km_prob_250k"] = 1 - merged["km_survival_prob"]

    # Final probability score (geometric weighted mean)
    merged["final_probability_score"] = (
        (merged["prob_250k_in_horizon"] ** w_mc) *
        (merged["bayes_prob_250k_in_horizon"] ** w_bayes) *
        (merged["km_prob_250k"] ** w_km) *
        (merged["expected_norm"] ** w_expected) *
        (merged["cohort_norm"] ** w_cohort)
    )

    merged["final_probability_score"] = merged["final_probability_score"].clip(0, 1)

    return merged


In [None]:
# Combine the per-channel results into one score.
# NOTE(review): mc_results, bayes_results, cohort_results and km_results are
# not assigned in any visible cell. Presumably they are the Monte Carlo output
# (sim_df), the Bayesian output (bayes_res), a per-channel cohort table and a
# per-channel Kaplan–Meier table - confirm before running.
# combine_all_results merges each of them on "channel_id" and reads
# "km_survival_prob" from km_df.
final_df = combine_all_results(
    df_base=df,
    mc_df=mc_results,
    bayes_df=bayes_results,
    cohort_df=cohort_results,     
    km_df=km_results              
)

# Show the 20 channels with the highest combined score
final_df.sort_values("final_probability_score", ascending=False).head(20)