<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/notebooks/create_training_and_validation_data_for_openai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths used later in this notebook are readable and writable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import os
from collections import Counter

# ☆
# Where the labelled source data lives and where the two splits will be written
input_file = "/content/drive/MyDrive/world-inflation/data/reddit/production/training-validation-main-prod-65.csv"
train_output = "/content/drive/MyDrive/world-inflation/data/reddit/production/training-main-prod-65.tsv"
validation_output = "/content/drive/MyDrive/world-inflation/data/reddit/production/validation-main-prod-65.tsv"

# Load the comma-separated input file
print("Reading input file...")
df = pd.read_csv(input_file, sep=',')

# Keep just the text column ('body') and the label column ('inflation')
df = df.loc[:, ['body', 'inflation']]

# Function to create balanced datasets
def create_balanced_datasets(data, train_ratio=0.75, *, label_col='inflation',
                             classes=(0, 1, 2), random_state=42):
    """Split ``data`` into class-balanced training and validation frames.

    Every class listed in ``classes`` is down-sampled to the size of the
    smallest one, the first ``int(min_count * train_ratio)`` sampled rows of
    each class go to the training set, and the remaining rows go to the
    validation set. Both sets are then shuffled and re-indexed from 0.

    Args:
        data: DataFrame containing at least the column ``label_col``.
        train_ratio: Fraction of each class's sampled rows assigned to the
            training set (the product is truncated with ``int``).
        label_col: Name of the column holding the class label.
        classes: Label values to balance across. Rows whose label is not
            listed here are ignored.
        random_state: Seed used for the per-class sampling and for the final
            shuffles, so repeated calls give the same split.

    Returns:
        A ``(train_df, val_df)`` tuple of DataFrames. Because every class is
        cut down to the smallest class, a listed class with no rows makes
        both frames empty.

    Raises:
        ValueError: If ``classes`` is empty.
    """
    classes = list(classes)
    if not classes:
        raise ValueError("classes must contain at least one label")

    # One sub-frame per requested class, in the order the classes were given.
    per_class = [data[data[label_col] == label] for label in classes]

    # The rarest class dictates how many rows every class contributes.
    min_class_count = min(len(subset) for subset in per_class)
    train_samples = int(min_class_count * train_ratio)

    train_parts = []
    val_parts = []
    for subset in per_class:
        # Random down-sample first, then cut the sample into train / validation
        # so the two pieces of a class never share a row.
        sampled = subset.sample(min_class_count, random_state=random_state)
        train_parts.append(sampled.iloc[:train_samples])
        val_parts.append(sampled.iloc[train_samples:min_class_count])

    # Concatenation leaves the rows grouped by class; shuffle to interleave
    # them and drop the original index.
    train_df = pd.concat(train_parts)
    val_df = pd.concat(val_parts)
    train_df = train_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    val_df = val_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return train_df, val_df

# Build the class-balanced training and validation splits from the loaded
# frame, using create_balanced_datasets' default train_ratio.
print("Creating balanced datasets with equal class distribution...")
train_df, val_df = create_balanced_datasets(df)

# Report the label distribution of the full input (before balancing),
# ordered by label value.
print("\nOriginal dataset distribution:")
original_counts = df['inflation'].value_counts().sort_index()
print(original_counts)
print(f"Total records: {len(df)}")

# Report the label distribution of the training split. The same counts are
# shown twice: as a pandas Series sorted by label, then as a plain dict.
print("\nTraining dataset distribution:")
train_counts = train_df['inflation'].value_counts().sort_index()
print(train_counts)
print(f"Total training records: {len(train_df)}")
print(f"Class distribution: {dict(Counter(train_df['inflation']))}")

# Report the label distribution of the validation split in the same two forms.
print("\nValidation dataset distribution:")
val_counts = val_df['inflation'].value_counts().sort_index()
print(val_counts)
print(f"Total validation records: {len(val_df)}")
print(f"Class distribution: {dict(Counter(val_df['inflation']))}")

# Make sure the parent directory of each output file exists;
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(os.path.dirname(train_output), exist_ok=True)
os.makedirs(os.path.dirname(validation_output), exist_ok=True)

# Write both splits as tab-separated files with a header row and without
# the DataFrame index column.
print("\nSaving files...")
train_df.to_csv(train_output, sep='\t', index=False)
val_df.to_csv(validation_output, sep='\t', index=False)

print(f"Training data saved to: {train_output}")
print(f"Validation data saved to: {validation_output}")
print("Done!")