In [9]:
%matplotlib inline

In [12]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os

def split_data(parquet_file, dropna=True):
    # Load the parquet file
    df = pd.read_parquet(parquet_file)
    if dropna:
        df = df.dropna()
    
    timestamps = df['timestamp'].unique()
    total_timestamps = len(timestamps)
    
    # Verify we have 60 timestamps
    assert total_timestamps == 60, f"Expected 60 timestamps, got {total_timestamps}"
    
    # Split timestamps into train/val/test
    train_timestamps = timestamps[:40]  # First 40 timestamps
    val_timestamps = timestamps[40:60]  # Last 20 timestamps
    
    # Split the data and drop timestamp column
    train_data = df[df['timestamp'].isin(train_timestamps)].drop(columns=['timestamp', 'filename'])
    val_data = df[df['timestamp'].isin(val_timestamps)].drop(columns=['timestamp', 'filename'])
    
    return train_data, val_data

def pca_analyze(train_data, val_data, target_column, output_filename):
    # Separate features (X) and target (y)
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    X_val = val_data.drop(columns=[target_column])
    y_val = val_data[target_column]
    
    # Get number of features
    n_features = X_train.shape[1]
    
    # Track RMSE for different PCA component choices
    rmse_per_component = []
    num_pc = range(1, n_features + 1)
    
    # Try each candidate number of principal components
    for k in num_pc:
        # Step 1: Fit PCA on training data
        pca = PCA(n_components=k)
        X_train_reduced = pca.fit_transform(X_train)
        X_val_reduced = pca.transform(X_val)
        
        # Step 2: Fit linear regression on reduced training data
        regressor = LinearRegression()
        regressor.fit(X_train_reduced, y_train)
        
        # Step 3: Predict on validation data and calculate RMSE
        y_val_pred = regressor.predict(X_val_reduced)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_per_component.append(rmse)
    
    # Find optimal number of components
    opt_idx = np.argmin(rmse_per_component)
    opt_k = num_pc[opt_idx]
    
    # Plot RMSE vs number of components
    plt.figure(figsize=(10, 6))
    plt.plot(num_pc, rmse_per_component, 'bo-')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('RMSE')
    plt.title('RMSE vs Number of Principal Components')
    plt.axvline(x=opt_k, color='r', linestyle='--', 
                label=f'Optimal components: {opt_k}')
    plt.legend()
    plt.grid(True)
    plt.savefig(output_filename)
    plt.close()
    
    return opt_k, rmse_per_component

# Main execution
# Initialize lists to store results
optimal_components = []
min_rmses = []
filenames = []

for i in range(1, 10):
    parquet_file = f"USD_720_PCR/2023-{i:02d}_L60.parquet"
    target_column = "log_return"  # Replace with your target column name

    # Extract filename without path and create output filename
    base_filename = os.path.basename(parquet_file).replace('.parquet', '')
    output_filename = f"USD_720_PCR/{base_filename}_train40_val20.png"

    train_data, val_data = split_data(parquet_file)

    optimal_n_components, rmse_per_component = pca_analyze(
        train_data, val_data, target_column, output_filename
    )

    # Store results in lists
    optimal_components.append(optimal_n_components)
    min_rmses.append(min(rmse_per_component))
    filenames.append(base_filename)

    print(f"\nResults for {base_filename}:")
    print(f"Optimal number of components: {optimal_n_components}")
    print(f"Minimum RMSE achieved: {min(rmse_per_component):.4f}")
    print(f"Plot saved as: {output_filename}")

# Create DataFrame from results
results_df = pd.DataFrame({
    'filename': filenames,
    'optimal_components': optimal_components,
    'min_rmse': min_rmses
})

# Save results to CSV
results_df.to_csv('USD_720_PCR/pca_results.csv', index=False)

# Print summary of all results
print("\nSummary of all results:")
print(results_df)


Results for 2023-01_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0040
Plot saved as: USD_720_PCR/2023-01_L60_train40_val20.png

Results for 2023-02_L60:
Optimal number of components: 28
Minimum RMSE achieved: 0.0025
Plot saved as: USD_720_PCR/2023-02_L60_train40_val20.png

Results for 2023-03_L60:
Optimal number of components: 13
Minimum RMSE achieved: 0.0303
Plot saved as: USD_720_PCR/2023-03_L60_train40_val20.png

Results for 2023-04_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0014
Plot saved as: USD_720_PCR/2023-04_L60_train40_val20.png

Results for 2023-05_L60:
Optimal number of components: 25
Minimum RMSE achieved: 0.0212
Plot saved as: USD_720_PCR/2023-05_L60_train40_val20.png

Results for 2023-06_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0122
Plot saved as: USD_720_PCR/2023-06_L60_train40_val20.png

Results for 2023-07_L60:
Optimal number of components: 23
Minimum RMSE achieved: 0.0205
Plot saved as: USD_720_PCR/2023-07_

FileNotFoundError: [Errno 2] No such file or directory: 'USD_720_PCR/2023-10_L60.parquet'

In [16]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import os


def pca_analyze(train_data, val_data, target_column, output_filename):
    # Separate features (X) and target (y)
    X_train = train_data.drop(columns=[target_column])
    y_train = train_data[target_column]
    X_val = val_data.drop(columns=[target_column])
    y_val = val_data[target_column]
    
    # Get number of features
    n_features = X_train.shape[1]
    
    # Track RMSE for different PCA component choices
    rmse_per_component = []
    num_pc = range(1, n_features + 1)
    
    # Try each candidate number of principal components
    for k in num_pc:
        # Step 1: Fit PCA on training data
        pca = PCA(n_components=k)
        X_train_reduced = pca.fit_transform(X_train)
        X_val_reduced = pca.transform(X_val)
        
        # Step 2: Fit linear regression on reduced training data
        regressor = LinearRegression()
        regressor.fit(X_train_reduced, y_train)
        
        # Step 3: Predict on validation data and calculate RMSE
        y_val_pred = regressor.predict(X_val_reduced)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_per_component.append(rmse)
    
    # Find optimal number of components
    opt_idx = np.argmin(rmse_per_component)
    opt_k = num_pc[opt_idx]
    
    # Plot RMSE vs number of components
    plt.figure(figsize=(10, 6))
    plt.plot(num_pc, rmse_per_component, 'bo-')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('RMSE')
    plt.title('RMSE vs Number of Principal Components')
    plt.axvline(x=opt_k, color='r', linestyle='--', 
                label=f'Optimal components: {opt_k}')
    plt.legend()
    plt.grid(True)
    plt.savefig(output_filename)
    plt.close()
    
    # Get final predictions using optimal number of components
    pca = PCA(n_components=opt_k)
    X_train_reduced = pca.fit_transform(X_train)
    X_val_reduced = pca.transform(X_val)
    regressor = LinearRegression()
    regressor.fit(X_train_reduced, y_train)
    y_val_pred = regressor.predict(X_val_reduced)
    
    return opt_k, rmse_per_component, y_val, y_val_pred

def plot_squared_errors(y_true, y_pred, filename):
    # Calculate squared errors
    squared_errors = (y_true - y_pred) ** 2
    
    # Create plot
    plt.figure(figsize=(12, 6))
    plt.plot(range(len(squared_errors)), squared_errors, 'ro-')
    plt.xlabel('Sample Index')
    plt.ylabel('Squared Error')
    plt.title('Squared Errors for Validation Samples')
    plt.grid(True)
    
    # Save plot
    output_filename = f"USD_720_PCR/{filename}_squared_errors.png"
    plt.savefig(output_filename)
    plt.close()
    return output_filename

# Main execution
# Initialize lists to store results
optimal_components = []
min_rmses = []
filenames = []

for i in range(1, 10):
    parquet_file = f"USD_720_PCR/2023-{i:02d}_L60.parquet"
    target_column = "log_return"  # Replace with your target column name

    # Extract filename without path and create output filename
    base_filename = os.path.basename(parquet_file).replace('.parquet', '')
    output_filename = f"USD_720_PCR/{base_filename}_train40_val20.png"

    train_data, val_data = split_data(parquet_file)

    optimal_n_components, rmse_per_component, y_val, y_val_pred = pca_analyze(
        train_data, val_data, target_column, output_filename
    )
    
    # Plot squared errors
    squared_errors_filename = plot_squared_errors(y_val, y_val_pred, base_filename)

    # Store results in lists
    optimal_components.append(optimal_n_components)
    min_rmses.append(min(rmse_per_component))
    filenames.append(base_filename)

    print(f"\nResults for {base_filename}:")
    print(f"Optimal number of components: {optimal_n_components}")
    print(f"Minimum RMSE achieved: {min(rmse_per_component):.4f}")
    print(f"RMSE plot saved as: {output_filename}")
    print(f"Squared errors plot saved as: {squared_errors_filename}")




Results for 2023-01_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0040
RMSE plot saved as: USD_720_PCR/2023-01_L60_train40_val20.png
Squared errors plot saved as: USD_720_PCR/2023-01_L60_squared_errors.png

Results for 2023-02_L60:
Optimal number of components: 28
Minimum RMSE achieved: 0.0025
RMSE plot saved as: USD_720_PCR/2023-02_L60_train40_val20.png
Squared errors plot saved as: USD_720_PCR/2023-02_L60_squared_errors.png

Results for 2023-03_L60:
Optimal number of components: 13
Minimum RMSE achieved: 0.0303
RMSE plot saved as: USD_720_PCR/2023-03_L60_train40_val20.png
Squared errors plot saved as: USD_720_PCR/2023-03_L60_squared_errors.png

Results for 2023-04_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0014
RMSE plot saved as: USD_720_PCR/2023-04_L60_train40_val20.png
Squared errors plot saved as: USD_720_PCR/2023-04_L60_squared_errors.png

Results for 2023-05_L60:
Optimal number of components: 25
Minimum RMSE achieved: 0.0212
RMSE plot saved

In [17]:
def split_data(parquet_file, dropna=True):
    # Load the parquet file
    df = pd.read_parquet(parquet_file)
    if dropna:
        df = df.dropna()
    
    timestamps = df['timestamp'].unique()
    total_timestamps = len(timestamps)
    
    # Verify we have 60 timestamps
    assert total_timestamps == 60, f"Expected 60 timestamps, got {total_timestamps}"
    
    # Split timestamps into train/val/test
    train_timestamps = timestamps[:40]  # First 40 timestamps
    val_timestamps = timestamps[40:60]  # Last 20 timestamps
    
    # Split the data but keep filename column for validation data
    train_data = df[df['timestamp'].isin(train_timestamps)].drop(columns=['timestamp'])
    val_data = df[df['timestamp'].isin(val_timestamps)].drop(columns=['timestamp'])
    
    return train_data, val_data


def analyze_top_mse_samples(y_true, y_pred, val_data, n_top=10):
    # Calculate squared errors
    squared_errors = (y_true - y_pred) ** 2
    
    # Create a DataFrame with the results
    results_df = pd.DataFrame({
        'squared_error': squared_errors,
        'filename': val_data['filename']
    })
    
    # Sort by squared error in descending order
    results_df = results_df.sort_values('squared_error', ascending=False)
    
    # Get top N samples
    top_samples = results_df.head(n_top)
    
    # Analyze dataset distribution
    dataset_counts = top_samples['filename'].value_counts()
    
    # Create a bar plot of dataset distribution
    plt.figure(figsize=(12, 6))
    dataset_counts.plot(kind='bar')
    plt.title(f'Distribution of Datasets in Top {n_top} MSE Samples')
    plt.xlabel('Dataset')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    return top_samples, dataset_counts


def pca_analyze(train_data, val_data, target_column, output_filename):
    # Separate features (X) and target (y)
    X_train = train_data.drop(columns=[target_column, 'filename'])
    y_train = train_data[target_column]
    X_val = val_data.drop(columns=[target_column, 'filename'])
    y_val = val_data[target_column]
    
    # Get number of features
    n_features = X_train.shape[1]
    
    # Track RMSE for different PCA component choices
    rmse_per_component = []
    num_pc = range(1, n_features + 1)
    
    # Try each candidate number of principal components
    for k in num_pc:
        # Step 1: Fit PCA on training data
        pca = PCA(n_components=k)
        X_train_reduced = pca.fit_transform(X_train)
        X_val_reduced = pca.transform(X_val)
        
        # Step 2: Fit linear regression on reduced training data
        regressor = LinearRegression()
        regressor.fit(X_train_reduced, y_train)
        
        # Step 3: Predict on validation data and calculate RMSE
        y_val_pred = regressor.predict(X_val_reduced)
        rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        rmse_per_component.append(rmse)
    
    # Find optimal number of components
    opt_idx = np.argmin(rmse_per_component)
    opt_k = num_pc[opt_idx]
    
    # Plot RMSE vs number of components
    plt.figure(figsize=(10, 6))
    plt.plot(num_pc, rmse_per_component, 'bo-')
    plt.xlabel('Number of Principal Components')
    plt.ylabel('RMSE')
    plt.title('RMSE vs Number of Principal Components')
    plt.axvline(x=opt_k, color='r', linestyle='--', 
                label=f'Optimal components: {opt_k}')
    plt.legend()
    plt.grid(True)
    plt.savefig(output_filename)
    plt.close()
    
    # Get final predictions using optimal number of components
    pca = PCA(n_components=opt_k)
    X_train_reduced = pca.fit_transform(X_train)
    X_val_reduced = pca.transform(X_val)
    regressor = LinearRegression()
    regressor.fit(X_train_reduced, y_train)
    y_val_pred = regressor.predict(X_val_reduced)
    
    return opt_k, rmse_per_component, y_val, y_val_pred


for i in range(1, 10):
    parquet_file = f"USD_720_PCR/2023-{i:02d}_L60.parquet"
    target_column = "log_return"

    base_filename = os.path.basename(parquet_file).replace('.parquet', '')
    output_filename = f"USD_720_PCR/{base_filename}_train40_val20.png"

    train_data, val_data = split_data(parquet_file)

    optimal_n_components, rmse_per_component, y_val, y_val_pred = pca_analyze(
        train_data, val_data, target_column, output_filename
    )
    
    # Analyze top MSE samples
    top_samples, dataset_counts = analyze_top_mse_samples(y_val, y_val_pred, val_data, n_top=10)
    
    # Save the dataset distribution plot
    plt.savefig(f"USD_720_PCR/{base_filename}_top_mse_distribution.png")
    plt.close()

    print(f"\nResults for {base_filename}:")
    print(f"Optimal number of components: {optimal_n_components}")
    print(f"Minimum RMSE achieved: {min(rmse_per_component):.4f}")
    print(f"RMSE plot saved as: {output_filename}")
    print("\nTop 10 MSE samples dataset distribution:")
    print(dataset_counts)
    print("\nDetailed information for top 10 MSE samples:")
    print(top_samples)


Results for 2023-01_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0040
RMSE plot saved as: USD_720_PCR/2023-01_L60_train40_val20.png

Top 10 MSE samples dataset distribution:
filename
PSTAKEUSD_720.parquet    2
ADXUSD_720.parquet       2
TUSD_720.parquet         1
KEEPUSD_720.parquet      1
BNCUSD_720.parquet       1
STGUSD_720.parquet       1
MXCUSD_720.parquet       1
TBTCUSD_720.parquet      1
Name: count, dtype: int64

Detailed information for top 10 MSE samples:
       squared_error               filename
8985        0.013868       TUSD_720.parquet
7005        0.011040    KEEPUSD_720.parquet
12760       0.005479  PSTAKEUSD_720.parquet
6225        0.001377     BNCUSD_720.parquet
8866        0.001143     ADXUSD_720.parquet
8867        0.001021     ADXUSD_720.parquet
12761       0.000823  PSTAKEUSD_720.parquet
3702        0.000660     STGUSD_720.parquet
12527       0.000644     MXCUSD_720.parquet
3405        0.000630    TBTCUSD_720.parquet

Results for 2023-02_L60:


In [19]:
def analyze_missing_entries(df, timestamps):
    # Filter data for the given timestamps
    df_filtered = df[df['timestamp'].isin(timestamps)]
    
    # Count entries per file
    entries_by_file = df_filtered.groupby('filename').size().reset_index(name='entry_count')
    
    # Calculate missing entries (expected 40 entries per file in training)
    expected_entries = 40
    entries_by_file['missing_count'] = expected_entries - entries_by_file['entry_count']
    entries_by_file['missing_percentage'] = (entries_by_file['missing_count'] / expected_entries * 100).round(2)
    
    return entries_by_file


def split_data(parquet_file, dropna=True):
    # Load the parquet file
    df = pd.read_parquet(parquet_file)
    
    timestamps = df['timestamp'].unique()
    total_timestamps = len(timestamps)
    
    # Verify we have 60 timestamps
    assert total_timestamps == 60, f"Expected 60 timestamps, got {total_timestamps}"
    
    # Split timestamps into train/val/test
    train_timestamps = timestamps[:40]  # First 40 timestamps
    val_timestamps = timestamps[40:60]  # Last 20 timestamps
    
    # Analyze missing entries in training data
    missing_analysis = analyze_missing_entries(df, train_timestamps)
    
    if dropna:
        df = df.dropna()
    
    # Split the data but keep filename column for validation data
    train_data = df[df['timestamp'].isin(train_timestamps)].drop(columns=['timestamp'])
    val_data = df[df['timestamp'].isin(val_timestamps)].drop(columns=['timestamp'])
    
    return train_data, val_data, missing_analysis


for i in range(1, 10):
    parquet_file = f"USD_720_PCR/2023-{i:02d}_L60.parquet"
    target_column = "log_return"

    base_filename = os.path.basename(parquet_file).replace('.parquet', '')
    output_filename = f"USD_720_PCR/{base_filename}_train40_val20.png"

    train_data, val_data, missing_analysis = split_data(parquet_file)

    print(f"\nMissing entries analysis for {base_filename}:")
    print(missing_analysis.sort_values('missing_count', ascending=False))
    print("\nSummary statistics for missing entries:")
    print(f"Total files with missing entries: {len(missing_analysis[missing_analysis['missing_count'] > 0])}")
    print(f"Average missing percentage: {missing_analysis['missing_percentage'].mean():.2f}%")
    print(f"Maximum missing percentage: {missing_analysis['missing_percentage'].max():.2f}%")
    print(f"Total missing entries: {missing_analysis['missing_count'].sum()}")


Missing entries analysis for 2023-01_L60:
                 filename  entry_count  missing_count  missing_percentage
0    1INCHUSD_720.parquet           40              0                 0.0
148   PONDUSD_720.parquet           40              0                 0.0
137    OMGUSD_720.parquet           40              0                 0.0
138   ORCAUSD_720.parquet           40              0                 0.0
139    OXTUSD_720.parquet           40              0                 0.0
..                    ...          ...            ...                 ...
75     FTMUSD_720.parquet           40              0                 0.0
76     FXSUSD_720.parquet           40              0                 0.0
77    GALAUSD_720.parquet           40              0                 0.0
78     GALUSD_720.parquet           40              0                 0.0
215    ZRXUSD_720.parquet           40              0                 0.0

[216 rows x 4 columns]

Summary statistics for missing entries:
Tota

In [21]:
def analyze_missing_entries(df, timestamps):
    # Filter data for the given timestamps
    df_filtered = df[df['timestamp'].isin(timestamps)]
    
    # Count entries per file
    entries_by_file = df_filtered.groupby('filename').size().reset_index(name='entry_count')
    
    # Calculate missing entries (expected 40 entries per file in training)
    expected_entries = 40
    entries_by_file['missing_count'] = expected_entries - entries_by_file['entry_count']
    entries_by_file['missing_percentage'] = (entries_by_file['missing_count'] / expected_entries * 100).round(2)
    
    # Create dictionary of filename:missing_percentage
    missing_dict = dict(zip(entries_by_file['filename'], entries_by_file['missing_percentage']))
    
    return entries_by_file, missing_dict

def analyze_top_mse_samples(y_true, y_pred, val_data, missing_dict, n_top=10):
    # Calculate squared errors
    squared_errors = (y_true - y_pred) ** 2
    
    # Create a DataFrame with the results
    results_df = pd.DataFrame({
        'squared_error': squared_errors,
        'filename': val_data['filename']
    })
    
    # Sort by squared error in descending order
    results_df = results_df.sort_values('squared_error', ascending=False)
    
    # Get top N samples
    top_samples = results_df.head(n_top)
    
    # Add missing percentage to the results
    top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)
    
    # Analyze dataset distribution
    dataset_counts = top_samples['filename'].value_counts()
    
    # Create a bar plot of dataset distribution
    plt.figure(figsize=(12, 6))
    dataset_counts.plot(kind='bar')
    plt.title(f'Distribution of Datasets in Top {n_top} MSE Samples')
    plt.xlabel('Dataset')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    
    return top_samples, dataset_counts

def split_data(parquet_file, dropna=True):
    # Load the parquet file
    df = pd.read_parquet(parquet_file)
    
    timestamps = df['timestamp'].unique()
    total_timestamps = len(timestamps)
    
    # Verify we have 60 timestamps
    assert total_timestamps == 60, f"Expected 60 timestamps, got {total_timestamps}"
    
    # Split timestamps into train/val/test
    train_timestamps = timestamps[:40]  # First 40 timestamps
    val_timestamps = timestamps[40:60]  # Last 20 timestamps
    
    # Analyze missing entries in training data
    missing_analysis, missing_dict = analyze_missing_entries(df, train_timestamps)
    
    if dropna:
        df = df.dropna()
    
    # Split the data but keep filename column for validation data
    train_data = df[df['timestamp'].isin(train_timestamps)].drop(columns=['timestamp'])
    val_data = df[df['timestamp'].isin(val_timestamps)].drop(columns=['timestamp'])
    
    return train_data, val_data, missing_analysis, missing_dict

# Main execution
# Initialize lists to store results
optimal_components = []
min_rmses = []
filenames = []

for i in range(1, 10):
    parquet_file = f"USD_720_PCR/2023-{i:02d}_L60.parquet"
    target_column = "log_return"

    base_filename = os.path.basename(parquet_file).replace('.parquet', '')
    output_filename = f"USD_720_PCR/{base_filename}_train40_val20.png"

    train_data, val_data, missing_analysis, missing_dict = split_data(parquet_file)

    print(f"\nMissing entries analysis for {base_filename}:")
    print(missing_analysis.sort_values('missing_count', ascending=False))
    print("\nSummary statistics for missing entries:")
    print(f"Total files with missing entries: {len(missing_analysis[missing_analysis['missing_count'] > 0])}")
    print(f"Average missing percentage: {missing_analysis['missing_percentage'].mean():.2f}%")
    print(f"Maximum missing percentage: {missing_analysis['missing_percentage'].max():.2f}%")
    print(f"Total missing entries: {missing_analysis['missing_count'].sum()}")

    optimal_n_components, rmse_per_component, y_val, y_val_pred = pca_analyze(
        train_data, val_data, target_column, output_filename
    )
    
    # Analyze top MSE samples
    top_samples, dataset_counts = analyze_top_mse_samples(y_val, y_val_pred, val_data, missing_dict, n_top=10)
    
    # Save the dataset distribution plot
    plt.savefig(f"USD_720_PCR/{base_filename}_top_mse_distribution.png")
    plt.close()

    # Store results in lists
    optimal_components.append(optimal_n_components)
    min_rmses.append(min(rmse_per_component))
    filenames.append(base_filename)

    print(f"\nResults for {base_filename}:")
    print(f"Optimal number of components: {optimal_n_components}")
    print(f"Minimum RMSE achieved: {min(rmse_per_component):.4f}")
    print(f"RMSE plot saved as: {output_filename}")
    print("\nTop 10 MSE samples with missing percentages:")
    print(top_samples[['filename', 'squared_error', 'missing_percentage']].sort_values('squared_error', ascending=False))
    print("\nDataset distribution in top 10 MSE samples:")
    print(dataset_counts)




Missing entries analysis for 2023-01_L60:
                 filename  entry_count  missing_count  missing_percentage
0    1INCHUSD_720.parquet           40              0                 0.0
148   PONDUSD_720.parquet           40              0                 0.0
137    OMGUSD_720.parquet           40              0                 0.0
138   ORCAUSD_720.parquet           40              0                 0.0
139    OXTUSD_720.parquet           40              0                 0.0
..                    ...          ...            ...                 ...
75     FTMUSD_720.parquet           40              0                 0.0
76     FXSUSD_720.parquet           40              0                 0.0
77    GALAUSD_720.parquet           40              0                 0.0
78     GALUSD_720.parquet           40              0                 0.0
215    ZRXUSD_720.parquet           40              0                 0.0

[216 rows x 4 columns]

Summary statistics for missing entries:
Tota

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-01_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0040
RMSE plot saved as: USD_720_PCR/2023-01_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                    filename  squared_error  missing_percentage
8985        TUSD_720.parquet       0.013868                 0.0
7005     KEEPUSD_720.parquet       0.011040                 0.0
12760  PSTAKEUSD_720.parquet       0.005479                 0.0
6225      BNCUSD_720.parquet       0.001377                 0.0
8866      ADXUSD_720.parquet       0.001143                 0.0
8867      ADXUSD_720.parquet       0.001021                 0.0
12761  PSTAKEUSD_720.parquet       0.000823                 0.0
3702      STGUSD_720.parquet       0.000660                 0.0
12527     MXCUSD_720.parquet       0.000644                 0.0
3405     TBTCUSD_720.parquet       0.000630                 0.0

Dataset distribution in top 10 MSE samples:
filename
PSTAKEUSD_720.parquet    2
ADXUSD_720.parquet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-02_L60:
Optimal number of components: 28
Minimum RMSE achieved: 0.0025
RMSE plot saved as: USD_720_PCR/2023-02_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                  filename  squared_error  missing_percentage
9284    KEYUSD_720.parquet       0.000503                 0.0
881    ANKRUSD_720.parquet       0.000455                 0.0
9281    KEYUSD_720.parquet       0.000442                 0.0
2443    TRUUSD_720.parquet       0.000256                 0.0
5867    SGBUSD_720.parquet       0.000244                 0.0
11337  TEERUSD_720.parquet       0.000210                 0.0
5691     MVUSD_720.parquet       0.000190                 0.0
882    ANKRUSD_720.parquet       0.000179                 0.0
2442    TRUUSD_720.parquet       0.000175                 0.0
8742    SYNUSD_720.parquet       0.000173                 0.0

Dataset distribution in top 10 MSE samples:
filename
KEYUSD_720.parquet     2
ANKRUSD_720.parquet    2
TRUUSD_720.parque

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-03_L60:
Optimal number of components: 13
Minimum RMSE achieved: 0.0303
RMSE plot saved as: USD_720_PCR/2023-03_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                  filename  squared_error  missing_percentage
7014   ROOKUSD_720.parquet       0.096021                 0.0
7013   ROOKUSD_720.parquet       0.061986                 0.0
13009   EULUSD_720.parquet       0.043571                 0.0
5680     MVUSD_720.parquet       0.034943                 0.0
2153    SBRUSD_720.parquet       0.029372                 0.0
7002   ROOKUSD_720.parquet       0.028781                 0.0
8625    XCNUSD_720.parquet       0.026704                 0.0
4133    RADUSD_720.parquet       0.023953                 0.0
3466    OMGUSD_720.parquet       0.020921                 0.0
7011   ROOKUSD_720.parquet       0.020033                 0.0

Dataset distribution in top 10 MSE samples:
filename
ROOKUSD_720.parquet    4
EULUSD_720.parquet     1
MVUSD_720.parquet

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-04_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0014
RMSE plot saved as: USD_720_PCR/2023-04_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                  filename  squared_error  missing_percentage
3226    CSMUSD_720.parquet       0.000554                 0.0
12826   MXCUSD_720.parquet       0.000531                 0.0
3228    CSMUSD_720.parquet       0.000428                 0.0
11510  TEERUSD_720.parquet       0.000234                 0.0
3220    CSMUSD_720.parquet       0.000221                 0.0
3229    CSMUSD_720.parquet       0.000218                 0.0
11512  TEERUSD_720.parquet       0.000161                 0.0
3290   INTRUSD_720.parquet       0.000124                 0.0
11519  TEERUSD_720.parquet       0.000114                 0.0
3231    CSMUSD_720.parquet       0.000101                 0.0

Dataset distribution in top 10 MSE samples:
filename
CSMUSD_720.parquet     5
TEERUSD_720.parquet    3
MXCUSD_720.parque

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-05_L60:
Optimal number of components: 25
Minimum RMSE achieved: 0.0212
RMSE plot saved as: USD_720_PCR/2023-05_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                   filename  squared_error  missing_percentage
4664    SAMOUSD_720.parquet       0.149025                 0.0
4662    SAMOUSD_720.parquet       0.119881                 0.0
4669    SAMOUSD_720.parquet       0.049189                 0.0
5081    ARPAUSD_720.parquet       0.041274                 0.0
8801     XCNUSD_720.parquet       0.024457                 0.0
6714     KARUSD_720.parquet       0.017104                 0.0
6889   MULTIUSD_720.parquet       0.017011                 0.0
10542    NMRUSD_720.parquet       0.016632                 0.0
13304    MIRUSD_720.parquet       0.015086                 0.0
4668    SAMOUSD_720.parquet       0.011528                 0.0

Dataset distribution in top 10 MSE samples:
filename
SAMOUSD_720.parquet     4
ARPAUSD_720.parquet     1
XCNU

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-06_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0122
RMSE plot saved as: USD_720_PCR/2023-06_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                 filename  squared_error  missing_percentage
10434  OXYUSD_720.parquet       0.033029                 0.0
10433  OXYUSD_720.parquet       0.032991                 0.0
5445   SRMUSD_720.parquet       0.024437                 0.0
5448   SRMUSD_720.parquet       0.022904                 0.0
5446   SRMUSD_720.parquet       0.019157                 0.0
10432  OXYUSD_720.parquet       0.018655                 0.0
5447   SRMUSD_720.parquet       0.017931                 0.0
5454   SRMUSD_720.parquet       0.013614                 0.0
5453   SRMUSD_720.parquet       0.012676                 0.0
5452   SRMUSD_720.parquet       0.009722                 0.0

Dataset distribution in top 10 MSE samples:
filename
SRMUSD_720.parquet    7
OXYUSD_720.parquet    3
Name: count, dtype: int64

Mis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-07_L60:
Optimal number of components: 23
Minimum RMSE achieved: 0.0205
RMSE plot saved as: USD_720_PCR/2023-07_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                 filename  squared_error  missing_percentage
7259  ROOKUSD_720.parquet       0.159032                 0.0
7258  ROOKUSD_720.parquet       0.157833                 0.0
7257  ROOKUSD_720.parquet       0.144280                 0.0
7256  ROOKUSD_720.parquet       0.119942                 0.0
7253  ROOKUSD_720.parquet       0.115300                 0.0
7254  ROOKUSD_720.parquet       0.113390                 0.0
7252  ROOKUSD_720.parquet       0.101108                 0.0
7255  ROOKUSD_720.parquet       0.098746                 0.0
7251  ROOKUSD_720.parquet       0.090251                 0.0
7250  ROOKUSD_720.parquet       0.075919                 0.0

Dataset distribution in top 10 MSE samples:
filename
ROOKUSD_720.parquet    10
Name: count, dtype: int64

Missing entries analysis 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)



Results for 2023-08_L60:
Optimal number of components: 29
Minimum RMSE achieved: 0.0063
RMSE plot saved as: USD_720_PCR/2023-08_L60_train40_val20.png

Top 10 MSE samples with missing percentages:
                 filename  squared_error  missing_percentage
7241  ROOKUSD_720.parquet       0.005967                 0.0
7240  ROOKUSD_720.parquet       0.004867                 0.0
7243  ROOKUSD_720.parquet       0.002829                 0.0
7249  ROOKUSD_720.parquet       0.002653                 0.0
7242  ROOKUSD_720.parquet       0.002164                 0.0
7255  ROOKUSD_720.parquet       0.002031                 0.0
7257  ROOKUSD_720.parquet       0.002011                 0.0
7245  ROOKUSD_720.parquet       0.001788                 0.0
7248  ROOKUSD_720.parquet       0.001782                 0.0
2262   BLZUSD_720.parquet       0.001717                 0.0

Dataset distribution in top 10 MSE samples:
filename
ROOKUSD_720.parquet    9
BLZUSD_720.parquet     1
Name: count, dtype: int64

M

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_samples['missing_percentage'] = top_samples['filename'].map(missing_dict)
