<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/src/count_label_ratio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored under MyDrive can be opened with ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from collections import Counter

def _detect_separator(file_path):
    """Guess the column separator from the first line of *file_path*.

    A tab takes precedence over a comma when both occur; tab is also the
    fallback when neither is present (single-column or empty first line).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()
    print(f"First line of file: '{first_line}'")

    if '\t' in first_line:
        print("Using tab separator")
        return '\t'
    if ',' in first_line:
        print("Using comma separator")
        return ','
    print("Using default tab separator")
    return '\t'


def _find_label_column(df):
    """Return the first recognised label column present in *df*, or None."""
    candidates = ('inflation', 'Inflation', 'INFLATION', 'label', 'sentiment')
    return next((name for name in candidates if name in df.columns), None)


def analyze_inflation_dataset(file_path):
    """Print a label-distribution report for a delimited dataset and return it.

    The file is read with pandas using a separator guessed from its first
    line. The label column is the first of ``inflation``, ``Inflation``,
    ``INFLATION``, ``label`` or ``sentiment`` found among the columns.

    Args:
        file_path: Path to a UTF-8 encoded CSV/TSV file with a header row.

    Returns:
        A ``(results, df)`` tuple. ``results`` is a dict with
        ``total_records`` and ``inflation_distribution`` (``counts`` and
        ``ratios`` keyed by label, ratios rounded to 4 decimals). ``df`` is
        the loaded DataFrame; when it has a ``body`` column a ``text_length``
        column is added to it. On any failure (missing file, unreadable
        content, no label column) an error is printed and ``(None, None)``
        is returned instead of raising.
    """
    try:
        print("Reading dataset...")

        separator = _detect_separator(file_path)
        df = pd.read_csv(file_path, sep=separator, encoding='utf-8')

        # Display basic info about the dataset
        print(f"Dataset shape: {df.shape}")
        print(f"Columns: {list(df.columns)}")
        print("\nFirst few rows:")
        print(df.head())

        inflation_col = _find_label_column(df)
        if inflation_col is None:
            print("\nError: Could not find inflation/label column.")
            print(f"Available columns: {list(df.columns)}")
            print("Please check the column names in your TSV file.")
            return None, None

        print(f"\nUsing column '{inflation_col}' for analysis...")

        total_records = len(df)

        # value_counts() drops missing labels, so counts and ratios describe
        # labelled rows only; missing labels are reported separately below.
        inflation_counts = df[inflation_col].value_counts().sort_index()
        inflation_ratios = df[inflation_col].value_counts(normalize=True).sort_index()

        results = {
            'total_records': total_records,
            'inflation_distribution': {
                'counts': inflation_counts.to_dict(),
                'ratios': {k: round(v, 4) for k, v in inflation_ratios.to_dict().items()}
            }
        }

        # Display results
        print("\n" + "="*50)
        print("DATASET ANALYSIS RESULTS")
        print("="*50)

        print(f"\n1. Total number of records: {total_records}")

        print("\n2. Inflation label distribution:")
        print("-" * 30)
        # Both series are sorted by the same index, so they can be walked in
        # lockstep without re-sorting or per-label lookups.
        for label, count, ratio in zip(inflation_counts.index, inflation_counts, inflation_ratios):
            print(f"   Label {label}: {count:,} records ({ratio * 100:.2f}%)")

        print("\n3. Additional Statistics:")
        print("-" * 30)
        if inflation_counts.empty:
            # idxmax()/idxmin() raise on an empty series (header-only file or
            # a label column that is entirely missing).
            print("   No labelled records found.")
        else:
            print(f"   Most common label: {inflation_counts.idxmax()} ({inflation_counts.max():,} records)")
            print(f"   Least common label: {inflation_counts.idxmin()} ({inflation_counts.min():,} records)")

        # Check for missing values
        has_body = 'body' in df.columns
        missing_body = df['body'].isnull().sum() if has_body else 0
        missing_inflation = df[inflation_col].isnull().sum()

        if missing_body > 0 or missing_inflation > 0:
            print("\n4. Missing Values:")
            print("-" * 30)
            if has_body:
                print(f"   Missing in 'body': {missing_body}")
            print(f"   Missing in '{inflation_col}': {missing_inflation}")

        # Text length statistics (if body column exists)
        if has_body:
            # astype(str) alone renders a missing body as the literal "nan"
            # (3 characters); count missing bodies as length 0 instead.
            lengths = df['body'].astype(str).str.len()
            df['text_length'] = lengths.mask(df['body'].isna(), 0).astype('int64')
            if total_records:
                print("\n5. Text Length Statistics:")
                print("-" * 30)
                print(f"   Average text length: {df['text_length'].mean():.1f} characters")
                print(f"   Median text length: {df['text_length'].median():.1f} characters")
                print(f"   Min text length: {df['text_length'].min()} characters")
                print(f"   Max text length: {df['text_length'].max()} characters")

        return results, df

    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        print("Please check if the file path is correct and the file exists.")
        return None, None
    except Exception as e:
        # Deliberate best-effort boundary for interactive notebook use:
        # report the problem and signal failure via (None, None).
        print(f"Error reading file: {str(e)}")
        return None, None

# Main execution
if __name__ == "__main__":
    # Labelled Reddit sample stored on the mounted Google Drive.
    file_path = "/content/drive/MyDrive/world-inflation/data/reddit/production/test-data-200.csv"

    # Run the report; a failed run is signalled by results being None.
    results, df = analyze_inflation_dataset(file_path)

    if results is None:
        print("Analysis failed. Please check the file path and try again.")
    else:
        print("\nAnalysis completed successfully!")

Reading dataset...
First line of file: 'body,inflation'
Using comma separator
Dataset shape: (200, 2)
Columns: ['body', 'inflation']

First few rows:
                                                body  inflation
0  I am getting ready to buy a mid-sized SUV. 3 r...          1
1  Tangential help, but maybe... Several years ag...          1
2  I don't know how the prices are where you are,...          2
3  You guys are officially my go-to site with thi...          0
4  Why don't more people shop at supermercados (M...          0

Using column 'inflation' for analysis...

DATASET ANALYSIS RESULTS

1. Total number of records: 200

2. Inflation label distribution:
------------------------------
   Label 0: 62 records (31.00%)
   Label 1: 77 records (38.50%)
   Label 2: 61 records (30.50%)

3. Additional Statistics:
------------------------------
   Most common label: 1 (77 records)
   Least common label: 2 (61 records)

5. Text Length Statistics:
------------------------------
   Average t