# In [1]:
import polars as pl
import sys

# File paths
# input_file: absolute Windows path of the CSV that main() loads with pl.read_csv.
# output_file: relative file name of the text report; main() opens it in "w"
# mode, so it is resolved against the current working directory and any
# existing file with that name is overwritten.
input_file = r"C:\Study\SYRACUSE\RA\Assignment 1\period_03\2024_tw_posts_president_scored_anon.csv"
output_file = "Output_Polar_tw_posts.txt"

def print_section(title):
    """Print a section banner for *title*.

    The banner is preceded by a blank line and consists of the title
    flanked on each side by a run of ten ``=`` characters.
    """
    bar = "=" * 10
    print(f"\n{bar} {title} {bar}")

def safe_cast_numeric(df, cols):
    """Return *df* with the listed columns cast to Float64.

    Names in *cols* that are not present in ``df.columns`` are ignored.
    The cast is non-strict (``strict=False``).
    """
    available = df.columns
    casts = []
    for name in cols:
        if name in available:
            casts.append(pl.col(name).cast(pl.Float64, strict=False))
    return df.with_columns(casts)

def describe_group(df, group_name):
    """Print a ``Group: <group_name>`` header followed by ``df.describe()``.

    The header is preceded by a blank line; the summary is printed on the
    lines after it.
    """
    header = f"\nGroup: {group_name}"
    print(header)
    summary = df.describe()
    print(summary)

def _matches(column, value):
    """Build a filter expression selecting rows where *column* equals *value*.

    Comparing a column to ``None`` with ``==`` yields null rather than True,
    so a null key would match no rows; it is matched with ``is_null()``
    instead.
    """
    if value is None:
        return pl.col(column).is_null()
    return pl.col(column) == value


def main():
    """Load the posts CSV and write a plain-text summary report.

    Reads ``input_file``, casts the engagement-count columns to Float64 and
    writes the following sections to ``output_file``: the column list with
    the first rows, descriptive statistics for the whole frame, the three
    most frequent ``source`` values, and per-group statistics for the first
    three distinct ``source`` values and the first three distinct
    ``(source, id)`` pairs, taken in order of first appearance.

    Everything printed while the report is being written is redirected to
    the output file; the previous ``sys.stdout`` is restored afterwards,
    even if an exception is raised part-way through.
    """
    # Function-scope import: the redirect helper is only needed here.
    from contextlib import redirect_stdout

    # Load and prepare the data before opening the report, so that a failed
    # read does not truncate an existing output file.
    df = pl.read_csv(input_file)

    numeric_cols = ["retweetCount", "replyCount", "likeCount", "quoteCount", "viewCount"]
    df = safe_cast_numeric(df, numeric_cols)

    # redirect_stdout restores whatever stream was active before (which is
    # not necessarily sys.__stdout__, e.g. inside a notebook) on exit.
    with open(output_file, "w", encoding="utf-8") as f, redirect_stdout(f):
        print_section("Header and First 5 Rows")
        print("Columns:", df.columns)
        print(df.head())

        print_section("Descriptive Statistics for Entire Dataset")
        print(df.describe())

        print_section("Top 3 Sources by Count")
        top_sources = df.group_by("source").len().sort("len", descending=True).head(3)
        print(top_sources)

        print_section("Grouped by source (First 3)")
        # maintain_order=True makes "first 3" deterministic (order of first
        # appearance) instead of depending on unique()'s arbitrary order.
        first_sources = (
            df.select("source").unique(maintain_order=True).head(3).to_series().to_list()
        )
        for source in first_sources:
            group = df.filter(_matches("source", source))
            describe_group(group, f"source = {source}")

        print_section("Grouped by source + id (First 3)")
        unique_combos = df.select(["source", "id"]).unique(maintain_order=True).head(3)
        for source, tweet_id in unique_combos.iter_rows():
            group = df.filter(_matches("source", source) & _matches("id", tweet_id))
            describe_group(group, f"source = {source}, id = {tweet_id}")

        print_section("Script Completed")

# Run the report only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
