In [None]:
# Necessary Libraries
import os
import pandas as pd
import numpy as np
from googleapiclient.discovery import build
from datetime import datetime
from datetime import timezone
from dateutil import parser
from pathlib import Path

In [None]:
# Load the per-channel YouTube dataset from CSV.
# The path below is a placeholder: point it at your own copy of the file.
file_path = r"C:\Users\...\youtube_data_4K_channels.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Load the Monte Carlo simulation results from CSV.
# The path below is a placeholder: point it at your own copy of the file.
mc_file_path = r"C:\Users\...\mc_simulation_data.csv"
sim_df = pd.read_csv(filepath_or_buffer=mc_file_path)

In [None]:
# Constructing functions for column creation and summarization
def add_start_year(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with an extra ``start_year`` column.

    The ``start_date`` column is parsed as ISO 8601 timestamps and the
    calendar year of each parsed value is stored in ``start_year``.
    The input frame is not modified (``DataFrame.assign`` returns a new frame).
    """
    parsed_start = pd.to_datetime(df["start_date"], format="ISO8601")
    return df.assign(start_year=parsed_start.dt.year)

# For cohort summary
def cohort_summary(
    df: pd.DataFrame,
    sim_results: pd.DataFrame,
    cohort_by: str = "niche"
) -> pd.DataFrame:
    """
    Summarize metrics by cohort (e.g., niche, start_year, uploads_per_week bucket).

    Parameters
    ----------
    df : pd.DataFrame
        Channel-level data. Must contain ``channel_id`` plus the aggregated
        columns ``current_subscriber_count``, ``growth_rate_per_day`` and
        ``engagement_rate`` (unless those come from ``sim_results``), and
        ``uploads_per_week`` when ``cohort_by="uploads_bucket"``.
    sim_results : pd.DataFrame
        Simulation output, left-joined onto ``df`` by ``channel_id``. Channels
        without a simulation row are kept with missing simulation values.
    cohort_by : str, default "niche"
        Column to group by. The special value ``"uploads_bucket"`` groups by a
        bucketed version of ``uploads_per_week`` that is computed here.

    Returns
    -------
    pd.DataFrame
        One row per cohort with median/mean metrics and the number of distinct
        channels, sorted by ``prob_250k_median`` in descending order.
    """
    merged = df.merge(sim_results, on="channel_id", how="left")

    # Optional: bucket uploads_per_week for cohorting
    if cohort_by == "uploads_bucket":
        bins = [0, 0.5, 1, 2, 4, 10, float("inf")]
        labels = ["<=0.5", "0.5-1", "1-2", "2-4", "4-10", ">10"]
        # include_lowest=True closes the first interval on the left, so channels
        # with exactly 0 uploads per week land in "<=0.5" instead of becoming
        # NaN and silently disappearing from the grouped summary.
        merged["uploads_bucket"] = pd.cut(
            merged["uploads_per_week"],
            bins=bins,
            labels=labels,
            right=True,
            include_lowest=True,
        )
        group_col = "uploads_bucket"
    else:
        group_col = cohort_by

    summary = (
        # observed=False is spelled out so that empty buckets are still listed
        # (with 0 channels) regardless of the pandas default for categoricals.
        merged.groupby(group_col, observed=False)
        .agg(
            current_subs_median=("current_subscriber_count", "median"),
            growth_rate_day_median=("growth_rate_per_day", "median"),
            engagement_rate_median=("engagement_rate", "median"),
            prob_250k_median=("prob_250k_in_horizon", "median"),
            prob_250k_mean=("prob_250k_in_horizon", "mean"),
            expected_end_subs_median=("expected_end_subs", "median"),
            channels=("channel_id", "nunique")
        )
        .sort_values("prob_250k_median", ascending=False)
        .reset_index()
    )
    return summary

In [None]:
# Derive the start_year column from start_date, then inspect the resulting schema
df = df.pipe(add_start_year)
df.info()

In [None]:
# Adding the age of the channel in days format
# Reference instant is timezone-aware (UTC).
now = datetime.now(timezone.utc)
# utc=True makes every parsed start_date timezone-aware in UTC, so subtracting
# it from the UTC-aware `now` works whether or not the raw strings carry an
# offset (a tz-naive column would otherwise raise TypeError here).
df["age_days"] = (
    pd.to_datetime(now)
    - pd.to_datetime(df["start_date"], format="ISO8601", utc=True)
).dt.days
df.info()

In [None]:
# Recompute uploads_per_week from total_videos and age_days
def _weekly_upload_rate(row):
    """Return total_videos * 7 / age_days rounded to 2 decimals, or 0 when either value is not positive."""
    if row["total_videos"] > 0 and row["age_days"] > 0:
        return round(row["total_videos"] * 7 / row["age_days"], 2)
    # Guard against division by zero / negative ages and channels without videos
    return 0


df["uploads_per_week"] = df.apply(_weekly_upload_rate, axis=1)

In [None]:
# Aggregate the merged channel/simulation metrics per start_year
year_cohort = cohort_summary(
    df=df,
    sim_results=sim_df,
    cohort_by="start_year",
)

In [None]:
# Aggregate the merged channel/simulation metrics per uploads_per_week bucket
uploads_cohort = cohort_summary(
    df=df,
    sim_results=sim_df,
    cohort_by="uploads_bucket",
)

In [None]:
# First 15 start_year cohorts when ordered by prob_250k_mean, highest first
(
    year_cohort
    .sort_values(by='prob_250k_mean', ascending=False)
    .head(15)
)

In [None]:
# First 15 start_year cohorts when ordered by engagement_rate_median, highest first
(
    year_cohort
    .sort_values(by='engagement_rate_median', ascending=False)
    .head(15)
)

In [None]:
# Preview the uploads-bucket cohort summary (first 10 rows, already sorted by prob_250k_median)
uploads_cohort.head(n=10)

In [None]:
# Write the enriched channel DataFrame (`df`) to a CSV file.
# The directory below is a placeholder: point it at your own output location.
output_dir = Path('C:/Users/.../YT Analysis Data')
output_filename = 'cohort_data.csv'
output_filepath = output_dir.joinpath(output_filename)
print(output_filepath)

# Make sure the target directory (and any missing parents) exists before writing
output_dir.mkdir(parents=True, exist_ok=True)

# index=False keeps the row index out of the CSV
df.to_csv(output_filepath, index=False)
print("Channels Saved Succesfully!")