# In [2]:
"""
pandas_stats_tw_posts_president_tabulate.py
Clean descriptive stats for Twitter posts dataset using Pandas + tabulate.
ALL OUTPUT SAVED TO TXT FILE ONLY.
"""

import contextlib
import sys

import pandas as pd
from tabulate import tabulate


# Cap the width pandas uses when it renders a column's text itself.
# NOTE(review): the report tables are rendered through tabulate, so this
# option may have no visible effect on the output file -- confirm.
pd.set_option('display.max_colwidth', 100)

# Input CSV read by main(); absolute, machine-specific Windows path.
filepath = r'C:\Users\vrush\Music\RA\period_03\2024_tw_posts_president_scored_anon.csv'
# Text report written by main(); relative to the current working directory.
output_path = 'Output_Pandas_tw_posts_pandas_stats.txt'


# Column(s) passed to DataFrame.groupby() for the per-group statistics.
group_columns = ['source']

def print_section(title):
    """Print a section banner for *title*.

    The banner is the title framed by ten '=' characters on each side,
    preceded by a blank line and followed by one (in addition to the
    newline that print() appends).
    """
    bar = '=' * 10
    print(f"\n{bar} {title} {bar}\n")

def format_table(df):
    """Render *df* as a GitHub-flavoured Markdown table string.

    Column labels are used as headers and the index is shown as the
    first column.
    """
    render_options = {
        'headers': 'keys',
        'tablefmt': 'github',
        'showindex': True,
    }
    return tabulate(df, **render_options)

def build_desc(df):
    """Build a compact per-column summary table for *df*.

    The result has one row per column of *df* and the columns
    ``Mean``, ``Min``, ``Max``, ``Unique``, ``Most Frequent`` and
    ``Freq Cnt``. Mean/Min/Max are coerced to numbers and rounded to two
    decimals; every missing or non-numeric cell is shown as ``'-'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.

    Returns
    -------
    pandas.DataFrame
        Summary indexed by the column names of *df*.
    """
    desc = df.describe(include='all').transpose()
    desc['unique'] = df.nunique()

    # Compute value_counts() once per column instead of once per lookup.
    most_freq = []
    freq_count = []
    for _, series in df.items():
        counts = series.value_counts()
        if counts.empty:
            # Column holds only missing values: nothing to report.
            most_freq.append("-")
            freq_count.append("-")
        else:
            most_freq.append(counts.index[0])
            freq_count.append(counts.iloc[0])
    desc['most_freq'] = pd.Series(most_freq, index=df.columns)
    desc['freq_count'] = pd.Series(freq_count, index=df.columns)

    # describe() omits mean/min/max when the frame has no numeric columns;
    # reindex() creates them as NaN instead of raising KeyError.
    desc = desc.reindex(
        columns=['mean', 'min', 'max', 'unique', 'most_freq', 'freq_count']
    )
    desc = desc.rename(columns={
        'mean': 'Mean',
        'min': 'Min',
        'max': 'Max',
        'unique': 'Unique',
        'most_freq': 'Most Frequent',
        'freq_count': 'Freq Cnt'
    })

    for col in ['Mean', 'Min', 'Max']:
        desc[col] = pd.to_numeric(desc[col], errors='coerce').round(2)

    desc = desc.fillna('-')
    return desc

def main():
    """Read the input CSV and write the full statistics report to a text file.

    The report contains the column header list, a summary table for the
    whole dataset and one summary table per group of ``group_columns``.
    Everything printed while the report is being built goes to
    ``output_path`` instead of the console.

    Raises whatever ``pd.read_csv`` / ``open`` raise for an unreadable
    input or unwritable output path.
    """
    df = pd.read_csv(filepath)

    # redirect_stdout restores sys.stdout even if a step below raises, so
    # the interpreter is never left printing into a closed file.
    with open(output_path, 'w', encoding='utf-8') as f, \
            contextlib.redirect_stdout(f):
        print_section('Header')
        print(f"Header: {list(df.columns)}\n")

        print_section('Descriptive Statistics for Entire Dataset')
        desc = build_desc(df)
        print(format_table(desc) + "\n")

        print_section(f'Grouped by {group_columns} (All Groups)')
        # Only the groupby() call is guarded: a missing group column is the
        # error being reported here, not failures inside the per-group work.
        try:
            grouped = df.groupby(group_columns)
        except KeyError as e:
            print(f"\n❌ KeyError: {e}")
            print(f"Available columns: {list(df.columns)}")
        else:
            for key, group in grouped:
                print(f"\nGroup: {key} (Count: {len(group)})")
                gdesc = build_desc(group)
                print(format_table(gdesc) + "\n")

        print_section('Script Completed')
        print('All stats saved in readable format.\n')

# Generate the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()
