<a href="https://colab.research.google.com/github/RyuichiSaito1/inflation-reddit-usa/blob/main/src/split_to_training_and_validation_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the cells below read and write CSV files under /content/drive/MyDrive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import os

def _allocate_class_counts(class_counts, total, size):
    """
    Work out how many rows to draw from each class for a sample of `size` rows

    Args:
        class_counts: Series mapping class value -> row count in the full data
        total: number of rows in the full data
        size: number of rows wanted in the sample

    Returns:
        dict of class value -> target row count. When at least one class
        exists the counts add up to `size`; with no classes the dict is empty.
    """
    ideal = {}
    allocation = {}
    for class_val in class_counts.index:
        target_ratio = class_counts[class_val] / total
        ideal[class_val] = size * target_ratio
        # Every class gets at least one row, even if its proportional share rounds to 0
        allocation[class_val] = max(1, int(round(ideal[class_val])))

    # Rounding (or the one-row floor) overshot: trim the largest class one row at a time
    while allocation and sum(allocation.values()) > size:
        max_class = max(allocation, key=lambda c: allocation[c])
        allocation[max_class] -= 1

    def representation(class_val):
        # Allocated rows relative to the exact proportional share (lower = more under-represented)
        expected = ideal[class_val]
        return allocation[class_val] / expected if expected > 0 else float('inf')

    # Rounding undershot: top up the class that is furthest below its proportional
    # share, so the extra rows do not inflate the smallest class
    while allocation and sum(allocation.values()) < size:
        min_class = min(allocation, key=representation)
        allocation[min_class] += 1

    return allocation


def create_stratified_samples(df, target_sizes, target_col='inflation'):
    """
    Create stratified samples of different sizes while maintaining class distribution
    Each sample is created independently from the original dataset

    Args:
        df: source DataFrame containing the column `target_col`
        target_sizes: iterable of requested sample sizes; a size larger than
            len(df) is reduced to len(df) with a warning
        target_col: name of the class-label column

    Returns:
        dict mapping each (possibly reduced) size to a shuffled DataFrame with a
        fresh 0..n-1 index. A size for which no rows could be drawn is omitted.
    """
    # Get the original class distribution
    original_dist = df[target_col].value_counts().sort_index()
    total = len(df)
    print("Original distribution:")
    for class_val, count in original_dist.items():
        print(f"Class {class_val}: {count} ({count/total*100:.1f}%)")

    # Store all samples
    samples = {}

    # Process each target size independently
    for i, size in enumerate(target_sizes):
        print(f"\n{'='*30}")
        print(f"Creating splitting data of size: {size}")
        print(f"{'='*30}")

        if size > len(df):
            print(f"Warning: Requested size {size} exceeds total data {len(df)}")
            size = len(df)

        # Per-class row counts that follow the original distribution and sum to `size`
        target_counts = _allocate_class_counts(original_dist, total, size)
        print(f"Target counts: {target_counts}")

        # Sample from each class
        sample_dfs = []
        for class_val, target_count in target_counts.items():
            class_data = df[df[target_col] == class_val]
            available_count = len(class_data)

            if available_count < target_count:
                print(f"Warning: Only {available_count} samples available for class {class_val}, need {target_count}")
                target_count = available_count

            if target_count > 0:
                # Use different random state for each size to ensure variety
                sample_dfs.append(class_data.sample(n=target_count, random_state=42+i))

        if not sample_dfs:
            print(f"Could not create sample of size {size}")
            continue

        # Combine and shuffle (different random state for each sample)
        sample_df = pd.concat(sample_dfs, ignore_index=True)
        sample_df = sample_df.sample(frac=1, random_state=100+i).reset_index(drop=True)
        samples[size] = sample_df

        # Print distribution for this sample
        sample_dist = sample_df[target_col].value_counts().sort_index()
        print(f"Complete dataset size: {len(sample_df)}")
        for class_val, count in sample_dist.items():
            print(f"Class {class_val}: {count} ({count/len(sample_df)*100:.1f}%)")

    return samples

def _print_class_distribution(frame, target_col):
    """
    Print each class's row count and share of `frame`, one indented line per class
    """
    n_rows = len(frame)
    for class_val, count in frame[target_col].value_counts().sort_index().items():
        print(f"  Class {class_val}: {count} ({count/n_rows*100:.1f}%)")


def split_and_save_datasets(samples, output_dir, target_col='inflation'):
    """
    Split each dataset into training (75%) and validation (25%) sets and save them

    Args:
        samples: dict mapping dataset size -> DataFrame to split
        output_dir: directory that receives training_data_<size>.csv and
            validation_data_<size>.csv; it is created if it does not exist
        target_col: name of the class-label column used for stratification

    Returns:
        (training_datasets, validation_datasets): two dicts keyed by dataset size
    """
    print(f"\n{'='*60}")
    print("SPLITTING DATASETS INTO TRAINING AND VALIDATION SETS")
    print(f"{'='*60}")

    # Make sure the destination exists before any to_csv call
    os.makedirs(output_dir, exist_ok=True)

    training_datasets = {}
    validation_datasets = {}

    for size, sample_df in samples.items():
        print(f"\n{'='*50}")
        print(f"Processing dataset of size {size}")
        print(f"{'='*50}")

        # A stratified split needs at least two rows of every class
        min_class_count = sample_df[target_col].value_counts().min()
        if min_class_count < 2:
            print(f"Warning: Dataset size {size} has classes with only 1 sample. Cannot perform stratified split.")
            print("Performing random split instead...")
            train_df, val_df = train_test_split(
                sample_df,
                test_size=0.25,
                random_state=42
            )
        else:
            # Perform stratified split to maintain class distribution
            print("Performing stratified split (75% training, 25% validation)...")
            train_df, val_df = train_test_split(
                sample_df,
                test_size=0.25,
                random_state=42,
                stratify=sample_df[target_col]
            )

        # Store the splits
        training_datasets[size] = train_df
        validation_datasets[size] = val_df

        # Print detailed information about the splits
        print(f"\nOriginal dataset: {len(sample_df)} samples")
        _print_class_distribution(sample_df, target_col)

        print(f"\nTraining set: {len(train_df)} samples ({len(train_df)/len(sample_df)*100:.1f}%)")
        _print_class_distribution(train_df, target_col)

        print(f"\nValidation set: {len(val_df)} samples ({len(val_df)/len(sample_df)*100:.1f}%)")
        _print_class_distribution(val_df, target_col)

        # Save training dataset
        train_path = os.path.join(output_dir, f'training_data_{size}.csv')
        train_df.to_csv(train_path, index=False)
        print(f"\n✓ Saved training data: {train_path}")
        print(f"  Records: {len(train_df)}")

        # Save validation dataset
        val_path = os.path.join(output_dir, f'validation_data_{size}.csv')
        val_df.to_csv(val_path, index=False)
        print(f"✓ Saved validation data: {val_path}")
        print(f"  Records: {len(val_df)}")

        # Verify files were saved successfully
        if os.path.exists(train_path) and os.path.exists(val_path):
            print(f"✓ File verification: Both files for size {size} successfully saved")
        else:
            print(f"✗ Error: Failed to save files for size {size}")

    return training_datasets, validation_datasets

def print_summary(training_datasets, validation_datasets):
    """
    Report how many training/validation datasets were produced and list
    each pair of output file names with its row counts, ordered by size
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print("SUMMARY OF CREATED DATASETS")
    print(rule)

    print(f"Total training datasets created: {len(training_datasets)}")
    print(f"Total validation datasets created: {len(validation_datasets)}")

    # Two-column header followed by a separator line
    print(f"\n{'Training Datasets:':<25} {'Validation Datasets:'}")
    print("-" * 50)

    # One line per dataset size, smallest first
    for size in sorted(training_datasets):
        n_train = len(training_datasets[size])
        n_val = len(validation_datasets[size])
        print(f"training_data_{size}.csv ({n_train:>3} samples)   validation_data_{size}.csv ({n_val:>2} samples)")

# Entry point of this cell
def main():
    """
    Load the source CSV, drop rows with a missing body or label, build the
    stratified samples, split and save each one, and print a summary.
    Errors are reported on stdout instead of being propagated.
    """
    rule = "=" * 60
    print(rule)
    print("STRATIFIED TRAINING DATA GENERATOR")
    print(rule)

    # Source CSV on the mounted Drive
    file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/training-validation-main-prod.csv'

    try:
        print(f"\nLoading data from: {file_path}")
        df = pd.read_csv(file_path)
        print(f"✓ Successfully loaded {len(df)} records")
        print(f"✓ Columns found: {list(df.columns)}")

        # Both the text column and the label column have to be present
        required_columns = ('body', 'inflation')
        if not all(column in df.columns for column in required_columns):
            raise ValueError("Required columns 'body' and 'inflation' not found in the dataset")

        # Count the missing entries in each required column
        missing_counts = {column: df[column].isnull().sum() for column in required_columns}
        print(f"Missing values - body: {missing_counts['body']}, inflation: {missing_counts['inflation']}")

        # Keep only rows where both required columns are filled in
        df_clean = df.dropna(subset=list(required_columns))
        print(f"✓ After removing missing values: {len(df_clean)} records")

        # Sizes of the datasets to generate
        target_sizes = [65, 129, 258, 517, 1033]
        print(f"✓ Target dataset sizes: {target_sizes}")

        print(f"\n{rule}")
        print("CREATING STRATIFIED SAMPLES")
        print(rule)

        samples = create_stratified_samples(df_clean, target_sizes)

        # Destination folder for the generated CSV files
        output_dir = '/content/drive/MyDrive/world-inflation/data/reddit/production'
        print(f"\nOutput directory: {output_dir}")

        # Split every sample, write the files, then summarise
        training_datasets, validation_datasets = split_and_save_datasets(samples, output_dir)
        print_summary(training_datasets, validation_datasets)

        print(f"\n{rule}")
        print("✓ ALL OPERATIONS COMPLETED SUCCESSFULLY!")
        print(rule)

    except FileNotFoundError:
        print(f"✗ Error: Could not find the file {file_path}")
        print("Please check if the file path is correct and the file exists.")

    except Exception as e:
        # Anything else: show the message and the full traceback
        print(f"✗ An error occurred: {e!s}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()

# Balance the classes: every class has the same number of samples in the training and validation data.

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG (every sampling call below also passes an explicit random_state)
np.random.seed(42)

# Input CSV and the two output CSVs
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/training-validation-main-prod.csv'
train_output_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/training_data_954.csv'
val_output_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/validation_data_954.csv'

df = pd.read_csv(file_path)

print(f"Original dataset shape: {df.shape}")
print(f"Original class distribution:\n{df['inflation'].value_counts().sort_index()}")

# Report missing values per column, then discard every row that contains any
print(f"\nMissing values:\n{df.isnull().sum()}")
df = df.dropna()

# The rarest class decides how many rows each class contributes
class_counts = df['inflation'].value_counts()
min_class_count = class_counts.min()
print(f"\nMinimum class count: {min_class_count}")
print(f"Will balance all classes to {min_class_count} samples each")

# Draw the same number of rows from every class, visiting classes in sorted order
balanced_dfs = [
    df[df['inflation'] == class_val].sample(n=min_class_count, random_state=42)
    for class_val in sorted(df['inflation'].unique())
]

# Stack the per-class draws and shuffle the result
balanced_df = pd.concat(balanced_dfs, ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nBalanced dataset shape: {balanced_df.shape}")
print(f"Balanced class distribution:\n{balanced_df['inflation'].value_counts().sort_index()}")

# 75% of each class (rounded down) goes to training, the rest to validation
train_samples_per_class = int(min_class_count * 0.75)
val_samples_per_class = min_class_count - train_samples_per_class

print(f"\nSamples per class in training: {train_samples_per_class}")
print(f"Samples per class in validation: {val_samples_per_class}")

# Cut each class at the same row position so the per-class counts stay equal
train_dfs, val_dfs = [], []
split_end = train_samples_per_class + val_samples_per_class
for class_val in sorted(balanced_df['inflation'].unique()):
    class_data = balanced_df[balanced_df['inflation'] == class_val].reset_index(drop=True)
    train_dfs.append(class_data.iloc[:train_samples_per_class])
    val_dfs.append(class_data.iloc[train_samples_per_class:split_end])

# Merge the per-class pieces, then shuffle each split once more
train_df = pd.concat(train_dfs, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
val_df = pd.concat(val_dfs, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

# Final statistics
print(f"\nFinal Training set shape: {train_df.shape}")
print(f"Training class distribution:\n{train_df['inflation'].value_counts().sort_index()}")

print(f"\nFinal Validation set shape: {val_df.shape}")
print(f"Validation class distribution:\n{val_df['inflation'].value_counts().sort_index()}")

# A split is balanced when all of its per-class counts share a single value
train_is_balanced = len(train_df['inflation'].value_counts().unique()) == 1
val_is_balanced = len(val_df['inflation'].value_counts().unique()) == 1
print("\nVerification - All classes have equal counts:")
print(f"Training set - Class counts are equal: {train_is_balanced}")
print(f"Validation set - Class counts are equal: {val_is_balanced}")

# Write both splits to disk
train_df.to_csv(train_output_path, index=False)
val_df.to_csv(val_output_path, index=False)

print("\nFiles saved successfully:")
print(f"Training data: {train_output_path}")
print(f"Validation data: {val_output_path}")

# Read the files back and report what was actually written
saved_train = pd.read_csv(train_output_path)
saved_val = pd.read_csv(val_output_path)

print("\nFinal verification from saved files:")
print(f"Training file - Shape: {saved_train.shape}, Class counts: {saved_train['inflation'].value_counts().sort_index().tolist()}")
print(f"Validation file - Shape: {saved_val.shape}, Class counts: {saved_val['inflation'].value_counts().sort_index().tolist()}")

# Balance the test data: every class has the same number of samples.

In [None]:
import pandas as pd
import numpy as np
import os

# Seed NumPy's global RNG (the sampling calls below also pass an explicit random_state)
np.random.seed(42)

# Source test CSV and the destination of the balanced, shuffled copy ("test-prod-180.csv")
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test.csv'
output_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/test-prod-180.csv'

df = pd.read_csv(file_path)

# Overview of the raw data
print(f"Original dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("Class distribution:")
print(df['inflation'].value_counts().sort_index())

# Missing values per column (reported only; no rows are removed here)
print("\nMissing values:")
print(df.isnull().sum())

# Preview of the data
print("\nFirst 5 rows:")
print(df.head())

# Per-class row counts, ordered by class value
class_counts = df['inflation'].value_counts().sort_index()
print("Original class distribution:")
print(class_counts)

# The rarest class decides how many rows each class contributes
min_count = class_counts.min()
print(f"\nMinimum class count: {min_count}")
print(f"Balancing all classes to {min_count} samples each")

# Draw min_count rows from every class, visiting classes in sorted order
balanced_dfs = [
    df[df['inflation'] == class_label].sample(n=min_count, random_state=42)
    for class_label in sorted(df['inflation'].unique())
]

# Stack the per-class draws, then shuffle
balanced_df = pd.concat(balanced_dfs, ignore_index=True)
shuffled_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\nBalanced and shuffled dataset shape: {shuffled_df.shape}")
print("Class distribution after balancing:")
print(shuffled_df['inflation'].value_counts().sort_index())

# Write the balanced set to disk
shuffled_df.to_csv(output_path, index=False)

print(f"\nBalanced data saved to: {output_path}")

# Closing statistics
print("\nFinal Statistics:")
print(f"Original total samples: {len(df)}")
print(f"Balanced total samples: {len(shuffled_df)}")
print(f"Samples per class: {min_count}")
print(f"Total classes: {len(df['inflation'].unique())}")
print("Data successfully balanced, shuffled, and saved!")

# Split the data into training, validation, and test sets.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Seed NumPy's global RNG (the sampling calls below also pass an explicit random_state)
np.random.seed(42)

# 1. Load the source CSV
file_path = '/content/drive/MyDrive/world-inflation/data/reddit/production/main-prod.csv'
df = pd.read_csv(file_path)

print(f"Original dataset shape: {df.shape}")
print("Class distribution:")
print(df['inflation'].value_counts().sort_index())
print("Class proportions:")
original_props = df['inflation'].value_counts(normalize=True).sort_index()
print(original_props)

# 2. Show the structure of the data
print(f"\nDataset columns: {list(df.columns)}")
print(f"Total records: {len(df)}")

# 3. Shuffle all rows and renumber them from 0
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
print("\nDataset shuffled successfully")

# 4. Size of each split
# The test split is fixed at 200 rows; whatever is left is divided 75% / 25%
# into training and validation. The figures below assume a 1243-row input:
#   remaining  = 1243 - 200   = 1043
#   training   = 1043 * 0.75  = 782.25 -> 782
#   validation = 1043 - 782   = 261
test_size = 200
remaining_size = len(df_shuffled) - test_size
train_size = int(remaining_size * 0.75)
val_size = remaining_size - train_size

print("\nTarget sizes:")
for split_label, split_size in (("Training", train_size), ("Validation", val_size), ("Test", test_size)):
    print(f"{split_label}: {split_size}")

# Per-class row counts and overall row count, used to derive the class quotas of each split
class_counts = df_shuffled['inflation'].value_counts().sort_index()
total_samples = len(df_shuffled)
# Per-class quota for one split
def calculate_class_samples(target_size, class_counts, total_samples):
    """
    Divide `target_size` rows among the classes in proportion to `class_counts`.

    Every class except the last one (in sorted label order) receives its
    rounded proportional share; the last class receives whatever is left, so
    the quotas always add up to `target_size`.

    Args:
        target_size: number of rows the split should contain
        class_counts: Series mapping class label -> row count in the full data
        total_samples: number of rows in the full data

    Returns:
        dict mapping class label -> number of rows to draw for that class
        (empty when `class_counts` has no entries)
    """
    ordered_labels = sorted(class_counts.index)
    quotas = {}
    unassigned = target_size

    # Proportional, rounded share for all classes but the last
    for label in ordered_labels[:-1]:
        share = int(round(target_size * (class_counts[label] / total_samples)))
        quotas[label] = share
        unassigned -= share

    # The last class absorbs the remainder
    if ordered_labels:
        quotas[ordered_labels[-1]] = unassigned

    return quotas

# Class quotas for the training, validation and test splits (computed in that order)
train_class_samples, val_class_samples, test_class_samples = (
    calculate_class_samples(split_size, class_counts, total_samples)
    for split_size in (train_size, val_size, test_size)
)

print("\nSamples per class:")
print(f"Training: {train_class_samples}")
print(f"Validation: {val_class_samples}")
print(f"Test: {test_class_samples}")

# 5. Create stratified splits with identical ratios
def create_split_with_exact_ratios(df, class_samples, split_name):
    """
    Draw the requested number of rows of each class from `df`.

    Args:
        df: pool of rows to draw from; must contain an 'inflation' column
        class_samples: dict mapping class label -> number of rows to draw
        split_name: name of the split, used only in the warning message

    Returns:
        DataFrame with the drawn rows, grouped by class in the order of
        `class_samples`. The rows keep the index labels they have in `df`, so
        `df.drop(result.index)` removes exactly the rows that were drawn.
        When a class has fewer rows than requested, all of its rows are taken
        and a warning is printed.
    """
    split_data = []

    for class_label, num_samples in class_samples.items():
        class_data = df[df['inflation'] == class_label]
        if len(class_data) < num_samples:
            print(f"Warning: Not enough samples for class {class_label} in {split_name}")
            selected_samples = class_data
        else:
            selected_samples = class_data.sample(n=num_samples, random_state=42)
        split_data.append(selected_samples)

    # Nothing requested: return an empty frame with df's columns
    if not split_data:
        return df.iloc[0:0]

    # Do NOT reset the index here: callers drop the drawn rows from the pool by
    # index label, which only works if the labels still refer to rows of `df`
    return pd.concat(split_data)

# Pool of rows still available for sampling; df_shuffled itself is left untouched
df_remaining = df_shuffled.copy()

# NOTE(review): each drop below removes the pool rows whose index labels appear in
# the split's index. That only removes the rows that were actually drawn if
# create_split_with_exact_ratios returns frames that keep the pool's index labels.
# Verify this, otherwise the splits can overlap.

# Test split first, then take its index labels out of the pool
test_data = create_split_with_exact_ratios(df_remaining, test_class_samples, "test")
df_remaining = df_remaining.drop(test_data.index).reset_index(drop=True)

# Training split from what is left, then take its index labels out of the pool
training_data = create_split_with_exact_ratios(df_remaining, train_class_samples, "training")
df_remaining = df_remaining.drop(training_data.index).reset_index(drop=True)

# Validation split from the rest of the pool
validation_data = create_split_with_exact_ratios(df_remaining, val_class_samples, "validation")

# Shuffle each split with its own seed and renumber the rows from 0
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True)
validation_data = validation_data.sample(frac=1, random_state=43).reset_index(drop=True)
test_data = test_data.sample(frac=1, random_state=44).reset_index(drop=True)

print("\nFinal split sizes:")
print(f"Training data shape: {training_data.shape}")
print(f"Validation data shape: {validation_data.shape}")
print(f"Test data shape: {test_data.shape}")

# Per-class counts and proportions of every split
train_counts = training_data['inflation'].value_counts().sort_index()
train_props = training_data['inflation'].value_counts(normalize=True).sort_index()
val_counts = validation_data['inflation'].value_counts().sort_index()
val_props = validation_data['inflation'].value_counts(normalize=True).sort_index()
test_counts = test_data['inflation'].value_counts().sort_index()
test_props = test_data['inflation'].value_counts(normalize=True).sort_index()

print("\nClass distributions:")
for heading, counts, props in (
    ("Training data:", train_counts, train_props),
    ("\nValidation data:", val_counts, val_props),
    ("\nTest data:", test_counts, test_props),
):
    print(heading)
    print(f"  Counts: {dict(counts)}")
    print(f"  Proportions: {dict(props)}")

# 6. Write the three splits to the output folder, creating it if necessary
output_dir = '/content/drive/MyDrive/world-inflation/data/reddit/production/'
os.makedirs(output_dir, exist_ok=True)

for file_name, split_frame in (
    ('training-data-1243.csv', training_data),
    ('validation-data-1243.csv', validation_data),
    ('test-data-1243.csv', test_data),
):
    split_frame.to_csv(os.path.join(output_dir, file_name), index=False)

n_train, n_val, n_test = training_data.shape[0], validation_data.shape[0], test_data.shape[0]
print("\nFiles saved successfully:")
print(f"- Training data: {n_train} records")
print(f"- Validation data: {n_val} records")
print(f"- Test data: {n_test} records")
print(f"- Total: {n_train + n_val + n_test} records")

# Show the class proportions of the three splits side by side
print("\nProportion verification (should be identical):")
print(f"Training proportions: {[f'{x:.4f}' for x in train_props.values]}")
print(f"Validation proportions: {[f'{x:.4f}' for x in val_props.values]}")
print(f"Test proportions: {[f'{x:.4f}' for x in test_props.values]}")

# Largest per-class gap in proportion between each pair of splits
max_diff_train_val = max(abs(train_props - val_props))
max_diff_train_test = max(abs(train_props - test_props))
max_diff_val_test = max(abs(val_props - test_props))

print("\nMaximum proportion differences:")
print(f"Training vs Validation: {max_diff_train_val:.6f}")
print(f"Training vs Test: {max_diff_train_test:.6f}")
print(f"Validation vs Test: {max_diff_val_test:.6f}")