# In [None]:
"""
polars_stats_tw_posts_president_tabulate.py
Clean descriptive stats for Twitter posts using Polars + tabulate.
ALL OUTPUT SAVED TO TXT FILE ONLY.
"""

import polars as pl
import sys
from tabulate import tabulate


# Input CSV that main() loads with pl.read_csv.
filepath = r'C:\Users\vrush\Music\RA\period_03\2024_tw_posts_president_scored_anon.csv'
# Text file that main() opens for writing; all printed output is redirected into it.
output_path = 'Output_Polars_tw_posts_stats.txt'

# Column names main() uses to select the distinct keys for the grouped section.
group_columns = ['source']

def print_section(title):
    """Print ``title`` as a banner framed by ten '=' characters on each side.

    The banner is preceded by a blank line and followed by one, so sections
    are visually separated in the output stream.
    """
    bar = "=" * 10
    print(f"\n{bar} {title} {bar}\n")

def format_table(df):
    """Render ``df`` as a GitHub-flavoured text table via ``tabulate``.

    Column keys are used as headers and the row index is shown.
    """
    table_options = {
        "headers": "keys",
        "tablefmt": "github",
        "showindex": True,
    }
    return tabulate(df, **table_options)

def build_desc(df: pl.DataFrame):
    """Build a per-column summary table for ``df``.

    The result has one row per column of ``df`` with these fields:

    * ``column``        - the column name
    * ``Mean``          - mean rounded to 2 decimals ("-" for text columns
                          or when no mean is available)
    * ``Min`` / ``Max`` - smallest / largest value (floats rounded to 2
                          decimals, "-" when the column has no values)
    * ``Unique``        - number of distinct values
    * ``Most Frequent`` - most common value, text columns only ("-" otherwise)
    * ``Freq Cnt``      - occurrences of that value ("-" otherwise)

    Statistics are computed from per-column Series reductions rather than
    from ``DataFrame.describe()``: ``describe()`` returns one *row* per
    statistic, so it has no "mean"/"min"/"max" columns to select from.

    Returns:
        A pandas DataFrame (via ``to_pandas()``), ready for ``tabulate``.
    """
    def _cell(value):
        # Render one statistic as text: None becomes the "-" placeholder and
        # floats are rounded to 2 decimals. Text cells keep every output
        # column a single dtype, which Polars requires.
        if value is None:
            return "-"
        if isinstance(value, float):
            return str(round(value, 2))
        return str(value)

    summary = {
        "column": [],
        "Mean": [],
        "Min": [],
        "Max": [],
        "Unique": [],
        "Most Frequent": [],
        "Freq Cnt": [],
    }

    for name in df.columns:
        series = df[name]
        is_text = series.dtype == pl.Utf8

        most_freq, freq_cnt = "-", "-"
        if is_text and series.len() > 0:
            # sort=True orders by frequency, most common first. The row is
            # read positionally as (value, count) because the count column's
            # name differs between Polars versions ("counts" vs "count").
            top_value, top_count = series.value_counts(sort=True).row(0)
            most_freq = "null" if top_value is None else str(top_value)
            freq_cnt = str(top_count)

        summary["column"].append(name)
        # A mean is not meaningful for text, so skip the reduction there.
        summary["Mean"].append("-" if is_text else _cell(series.mean()))
        summary["Min"].append(_cell(series.min()))
        summary["Max"].append(_cell(series.max()))
        summary["Unique"].append(series.n_unique())
        summary["Most Frequent"].append(most_freq)
        summary["Freq Cnt"].append(freq_cnt)

    return pl.DataFrame(summary).to_pandas()

def _group_filter(columns, key):
    """Return a Polars expression selecting rows whose ``columns`` equal ``key``.

    ``key`` is one tuple of values aligned with ``columns``. A ``None`` entry
    is matched with ``is_null()`` because ``== None`` evaluates to null and
    would select no rows.
    """
    mask = None
    for col, value in zip(columns, key):
        cond = pl.col(col).is_null() if value is None else pl.col(col) == value
        mask = cond if mask is None else mask & cond
    return mask


def main():
    """Read the CSV at ``filepath`` and write the statistics report to ``output_path``.

    While the report is being produced ``sys.stdout`` is pointed at the output
    file, so everything printed by this function and its helpers lands in the
    file. The original stream is restored in a ``finally`` block, even if a
    step raises.

    The report contains the column header, whole-dataset statistics, and one
    statistics table per distinct combination of ``group_columns``.
    """
    df = pl.read_csv(filepath)

    original_stdout = sys.stdout
    with open(output_path, 'w', encoding='utf-8') as f:
        sys.stdout = f  # redirect all output
        try:
            print_section('Header')
            print(f"Header: {df.columns}\n")

            print_section('Descriptive Statistics for Entire Dataset')
            desc = build_desc(df)
            print(format_table(desc) + "\n")

            print_section(f'Grouped by {group_columns} (All Groups)')
            # Check for missing group columns up front so the report names
            # them explicitly instead of failing inside Polars.
            missing = [col for col in group_columns if col not in df.columns]
            if missing:
                print(f"\n❌ Missing group column(s): {missing}")
                print(f"Available columns: {df.columns}")
            else:
                try:
                    # maintain_order=True keeps groups in first-seen order so
                    # repeated runs produce the same report layout.
                    unique_keys = (
                        df.select(group_columns)
                        .unique(maintain_order=True)
                        .rows()
                    )
                    for key in unique_keys:
                        key_str = key[0] if len(key) == 1 else key
                        # Filter on every configured group column, not just
                        # a hard-coded one.
                        group = df.filter(_group_filter(group_columns, key))
                        print(f"\nGroup: {key_str} (Count: {group.height})")
                        gdesc = build_desc(group)
                        print(format_table(gdesc) + "\n")
                except pl.exceptions.ComputeError as e:
                    print(f"\n❌ Polars error: {e}")
                    print(f"Available columns: {df.columns}")

            print_section('Script Completed')
            print('All Polars stats saved in readable format.\n')
        finally:
            # Always restore stdout; otherwise a failure would leave it bound
            # to the (closed) report file.
            sys.stdout = original_stdout

# Run the report only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
