# In [3]:
import csv
from collections import Counter, defaultdict
from math import sqrt
import sys

def load_csv(file_path):
    """Read a UTF-8 CSV file into a header list and a list of row dicts.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A ``(fieldnames, rows)`` tuple. ``fieldnames`` holds the column names
        taken from the first line of the file and ``rows`` holds one dict per
        data row, as produced by ``csv.DictReader``. A completely empty file
        yields ``([], [])``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline='' hands line-ending handling to the csv module, so line breaks
    # embedded inside quoted fields are preserved exactly as written.
    with open(file_path, mode='r', encoding='utf-8', newline='') as f:
        reader = csv.DictReader(f)
        data = list(reader)
        # DictReader.fieldnames is None when the file has no header line;
        # return an empty list so callers can always iterate over the header.
        return (reader.fieldnames or []), data

def is_float(val):
    """Report whether ``val`` is accepted by the built-in ``float()``.

    Args:
        val: Any object; typically a string cell read from a CSV file.

    Returns:
        True when ``float(val)`` succeeds, False when it raises
        ``ValueError`` or ``TypeError`` (e.g. non-numeric text or ``None``).
    """
    try:
        float(val)
    except (ValueError, TypeError):
        return False
    else:
        return True

def detect_numeric_columns(header, data, threshold=0.8):
    """Return the set of columns whose non-missing values are mostly numeric.

    A cell counts as missing when it is ``None`` or when its stripped text is
    ``""``, ``"NA"`` or ``"N/A"``. Columns with no non-missing cells are never
    reported as numeric.

    Args:
        header: Iterable of column names to inspect.
        data: List of row dicts keyed by column name.
        threshold: Minimum fraction (0-1) of non-missing cells that must be
            accepted by ``is_float`` for the column to count as numeric.

    Returns:
        A set containing the names of the columns judged numeric.
    """
    missing_markers = ("", "NA", "N/A")
    numeric_cols = set()
    for col in header:
        # csv.DictReader fills cells absent from a short row with None, so
        # None is treated as missing rather than being passed to .strip().
        values = [
            value
            for value in (row[col] for row in data)
            if value is not None and value.strip() not in missing_markers
        ]
        if not values:
            continue
        numeric = sum(1 for v in values if is_float(v))
        if numeric / len(values) >= threshold:
            numeric_cols.add(col)
    return numeric_cols

def compute_stats(header, data, numeric_cols, group_name=None):
    """Build a plain-text table of per-column descriptive statistics.

    Columns listed in ``numeric_cols`` report count, mean, min, max and the
    population standard deviation (the variance divides by ``count``); mean
    and std are rounded to two decimals. All other columns report count, the
    number of distinct values and the most frequent value. Cells that are
    ``None`` or whose stripped text is ``""``, ``"NA"`` or ``"N/A"`` are
    ignored; in numeric columns, cells rejected by ``is_float`` are ignored
    as well.

    Args:
        header: Iterable of column names, in the order they should be listed.
        data: List of row dicts keyed by column name.
        numeric_cols: Collection of column names to summarise numerically.
        group_name: Optional label shown in the title line; when falsy the
            title refers to the entire dataset.

    Returns:
        The title line, table header, separator and one line per column,
        joined with newlines into a single string.
    """
    if group_name:
        title = f"\n===== Descriptive Statistics for group: {group_name} ====="
    else:
        title = "\n===== Descriptive Statistics for Entire Dataset ====="
    lines = [title]

    missing_markers = ("", "NA", "N/A")
    stats_rows = []
    for col in header:
        # csv.DictReader fills cells absent from a short row with None, so
        # None is treated as missing rather than being passed to .strip().
        col_data = [
            value
            for value in (row[col] for row in data)
            if value is not None and value.strip() not in missing_markers
        ]

        if col in numeric_cols:
            numbers = [float(v) for v in col_data if is_float(v)]
            if numbers:
                count = len(numbers)
                # Keep the unrounded mean for the variance; rounding it first
                # would leak the rounding error into the standard deviation.
                exact_mean = sum(numbers) / count
                if count > 1:
                    variance = sum((x - exact_mean) ** 2 for x in numbers) / count
                    std = round(sqrt(variance), 2)
                else:
                    std = 0.0
                stats_rows.append(
                    [col, count, round(exact_mean, 2), min(numbers), max(numbers), std, "", ""]
                )
            else:
                stats_rows.append([col, 0, "", "", "", "", "", ""])
        else:
            count = len(col_data)
            freq = Counter(col_data)
            most_common = freq.most_common(1)[0] if freq else ("", 0)
            stats_rows.append(
                [col, count, "", "", "", "", len(freq), f"{most_common[0]} (Count: {most_common[1]})"]
            )

    headers = ["Column", "Count", "Mean", "Min", "Max", "Std", "Unique", "Most Frequent"]
    # Transpose header + rows so each table column can be measured on its own.
    col_widths = [
        max(len(str(cell)) for cell in column)
        for column in zip(headers, *stats_rows)
    ]

    lines.append(" | ".join(name.ljust(width) for name, width in zip(headers, col_widths)))
    lines.append("-+-".join("-" * width for width in col_widths))
    for row in stats_rows:
        lines.append(" | ".join(str(cell).ljust(width) for cell, width in zip(row, col_widths)))
    return "\n".join(lines)

def group_by_column(data, column):
    """Partition rows by the value they hold in ``column``.

    Args:
        data: Iterable of row dicts.
        column: Key whose value decides which group a row belongs to.

    Returns:
        A ``defaultdict(list)`` mapping each distinct value of ``column`` to
        the rows carrying that value, with rows kept in their input order.

    Raises:
        KeyError: If a row has no ``column`` key.
    """
    buckets = defaultdict(list)
    for record in data:
        buckets[record[column]].append(record)
    return buckets

if __name__ == "__main__":
    # === Change these paths as needed ===
    input_file = r"C:\Study\SYRACUSE\RA\Assignment 1\period_03\2024_tw_posts_president_scored_anon.csv"
    output_file = 'Output_Python_tw_posts_entire_dataset.txt'

    header, data = load_csv(input_file)
    numeric_cols = detect_numeric_columns(header, data)

    # Print directly into the report file instead of reassigning sys.stdout:
    # an exception can then never leave stdout pointing at a closed file, and
    # hosts that install their own sys.stdout (notebooks, IDEs) keep it.
    with open(output_file, "w", encoding="utf-8") as f:
        print(f"\nDetected numeric columns: {sorted(numeric_cols)}", file=f)
        print(compute_stats(header, data, numeric_cols), file=f)

        grouped = group_by_column(data, "source")
        print("\n===== Grouped by source (first 3 shown) =====", file=f)
        # Only the first three groups (in first-seen order) are reported.
        for i, (group_key, group_data) in enumerate(grouped.items()):
            if i >= 3:
                break
            print(
                compute_stats(header, group_data, numeric_cols, group_name=f"source = {group_key}"),
                file=f,
            )

        print("\n===== Script Completed =====", file=f)
