# In this notebook we will check if the models csv fiels that we have are in a good quality

In [None]:
import pandas as pd
from pathlib import Path

# Directory containing the prediction CSVs
base_path = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Final_runs_csv")

# List of file names
file_names = [
    "catboost_trial57_predictions",
    "cnn_predictions",
    "gru_trial28_f05_preds",
    "lgbm_predictions_formatted_backup",
    "logisticreg_validation_predictions",
    "lstm_test_predictions",
    "RandomForest_predictions_custom_high_precision",
    "TCN_Trial_36_predictions",
    "xgboost_predictions_fixed",
    "cnn_lstm_val_preds_20250614_142329"
]

# Store columns and start dates
column_sets = {}
start_dates = {}

for fname in file_names:
    fpath = base_path / f"{fname}.csv"
    try:
        df = pd.read_csv(fpath)
        column_sets[fname] = list(df.columns)
        if 'timestamp' in df.columns:
            start_dates[fname] = pd.to_datetime(df['timestamp']).min()
        else:
            start_dates[fname] = "❌ No 'timestamp' column"
    except Exception as e:
        column_sets[fname] = f"❌ Error reading file: {e}"
        start_dates[fname] = "❌"

# Check column consistency
print("\n🧪 COLUMN CHECK:")
first_cols = list(column_sets.values())[0]
consistent = True
for fname, cols in column_sets.items():
    if cols != first_cols:
        consistent = False
        print(f"❌ Columns mismatch in: {fname}")
    else:
        print(f"✅ Columns match: {fname}")

if consistent:
    print("\n✅ All files have the same columns.\n")
else:
    print("\n⚠️ Not all files have matching columns.\n")

# Print start dates
print("📅 START DATES FROM 'timestamp':")
for fname, start in start_dates.items():
    print(f"{fname}: {start}")



🧪 COLUMN CHECK:
✅ Columns match: catboost_trial57_predictions
✅ Columns match: cnn_predictions
✅ Columns match: gru_trial28_predictions
✅ Columns match: lgbm_predictions_formatted_backup
✅ Columns match: logisticreg_validation_predictions
✅ Columns match: lstm_test_predictions
✅ Columns match: RandomForest_predictions_custom_high_precision
✅ Columns match: TCN_Trial_36_predictions
✅ Columns match: xgboost_predictions_fixed

✅ All files have the same columns.

📅 START DATES FROM 'timestamp':
catboost_trial57_predictions: 2023-10-16 16:00:00
cnn_predictions: 2023-10-20 16:00:00
gru_trial28_predictions: 2024-03-15 12:00:00
lgbm_predictions_formatted_backup: 2023-08-08 00:00:00
logisticreg_validation_predictions: 2023-10-16 16:00:00
lstm_test_predictions: 2023-10-20 12:00:00
RandomForest_predictions_custom_high_precision: 2023-10-16 16:00:00
TCN_Trial_36_predictions: 2023-10-18 16:00:00
xgboost_predictions_fixed: 2023-05-07 12:00:00


  start_dates[fname] = pd.to_datetime(df['timestamp']).min()
  start_dates[fname] = pd.to_datetime(df['timestamp']).min()
  start_dates[fname] = pd.to_datetime(df['timestamp']).min()


In [16]:
import pandas as pd
from pathlib import Path
import numpy as np
from datetime import datetime

# Directory containing the prediction CSVs
base_path = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Final_runs_csv")

# List of file names
file_names = [
    "catboost_trial57_predictions",
    "cnn_predictions",
    "gru_trial28_predictions",
    "lgbm_predictions_formatted_backup",
    "logisticreg_validation_predictions",
    "lstm_test_predictions",
    "RandomForest_predictions_custom_high_precision",
    "TCN_Trial_36_predictions",
    "xgboost_predictions_fixed"
]

# Store results
results = {}
all_columns = []

print("=" * 80)
print("COMPREHENSIVE PREDICTION FILES ANALYSIS")
print("=" * 80)

# Process each file
for file_name in file_names:
    file_path = base_path / f"{file_name}.csv"
    
    print(f"\n{'─' * 60}")
    print(f"📄 {file_name}")
    print(f"{'─' * 60}")
    
    try:
        # Read CSV
        df = pd.read_csv(file_path)
        
        # Get columns
        columns = list(df.columns)
        all_columns.append(set(columns))
        
        # Parse timestamp column
        timestamp_col = 'timestamp'
        if timestamp_col in df.columns:
            # Try to parse timestamps
            try:
                df[timestamp_col] = pd.to_datetime(df[timestamp_col])
                
                # Get date range
                min_date = df[timestamp_col].min()
                max_date = df[timestamp_col].max()
                date_range_days = (max_date - min_date).days
                
                # Store results
                results[file_name] = {
                    'status': 'success',
                    'columns': columns,
                    'num_rows': len(df),
                    'min_date': min_date,
                    'max_date': max_date,
                    'date_range_days': date_range_days,
                    'has_nulls': df.isnull().any().any(),
                    'null_counts': df.isnull().sum().to_dict() if df.isnull().any().any() else {}
                }
                
                # Print summary
                print(f"✅ Successfully loaded")
                print(f"   Rows: {len(df):,}")
                print(f"   Columns: {columns}")
                print(f"   Date range: {min_date} to {max_date}")
                print(f"   Duration: {date_range_days} days")
                if results[file_name]['has_nulls']:
                    print(f"   ⚠️  Contains null values: {results[file_name]['null_counts']}")
                
            except Exception as e:
                results[file_name] = {
                    'status': 'timestamp_error',
                    'error': str(e),
                    'columns': columns,
                    'num_rows': len(df)
                }
                print(f"❌ Timestamp parsing error: {e}")
                
        else:
            results[file_name] = {
                'status': 'no_timestamp',
                'columns': columns,
                'num_rows': len(df)
            }
            print(f"❌ No timestamp column found")
            
    except FileNotFoundError:
        results[file_name] = {'status': 'not_found'}
        print(f"❌ File not found")
    except Exception as e:
        results[file_name] = {'status': 'error', 'error': str(e)}
        print(f"❌ Error: {e}")

# Column consistency check
print(f"\n{'=' * 80}")
print("COLUMN CONSISTENCY ANALYSIS")
print(f"{'=' * 80}")

if all_columns:
    # Check if all have same columns
    first_cols = all_columns[0]
    all_same = all(cols == first_cols for cols in all_columns)
    
    if all_same:
        print("\n✅ All files have IDENTICAL column structure!")
        print(f"   Columns: {sorted(first_cols)}")
    else:
        print("\n❌ Files have DIFFERENT column structures!")
        
        # Find common columns
        common_cols = set.intersection(*all_columns) if all_columns else set()
        print(f"\n   Common columns: {sorted(common_cols)}")
        
        # Show differences
        print("\n   Differences by file:")
        for file_name, result in results.items():
            if result['status'] == 'success' and 'columns' in result:
                file_cols = set(result['columns'])
                unique_cols = file_cols - common_cols
                missing_cols = first_cols - file_cols
                
                if unique_cols or missing_cols:
                    print(f"\n   {file_name}:")
                    if unique_cols:
                        print(f"     + Unique: {sorted(unique_cols)}")
                    if missing_cols:
                        print(f"     - Missing: {sorted(missing_cols)}")

# Date range analysis
print(f"\n{'=' * 80}")
print("DATE RANGE ANALYSIS")
print(f"{'=' * 80}")

successful_files = [(name, res) for name, res in results.items() 
                   if res['status'] == 'success' and 'min_date' in res]

if successful_files:
    # Sort by start date
    successful_files.sort(key=lambda x: x[1]['min_date'])
    
    print("\n📅 Date Ranges (sorted by start date):")
    print(f"{'File':<50} {'Start Date':<20} {'End Date':<20} {'Days':<10} {'Rows':<10}")
    print("─" * 110)
    
    for file_name, info in successful_files:
        print(f"{file_name:<50} {str(info['min_date']):<20} {str(info['max_date']):<20} "
              f"{info['date_range_days']:<10} {info['num_rows']:<10,}")
    
    # Find common date range
    all_starts = [info['min_date'] for _, info in successful_files]
    all_ends = [info['max_date'] for _, info in successful_files]
    
    common_start = max(all_starts)
    common_end = min(all_ends)
    common_days = (common_end - common_start).days
    
    print(f"\n📊 Common Date Range (overlap of all files):")
    print(f"   Start: {common_start}")
    print(f"   End: {common_end}")
    print(f"   Duration: {common_days} days")
    
    if common_days < 0:
        print("\n   ⚠️  WARNING: No common overlap period! Files don't share any dates.")
    else:
        # Calculate how many rows each file would have in common range
        print(f"\n   Estimated rows in common range:")
        for file_name, info in successful_files:
            # Read the file again to count rows in common range
            file_path = base_path / f"{file_name}.csv"
            df = pd.read_csv(file_path)
            df['timestamp'] = pd.to_datetime(df['timestamp'])
            common_rows = len(df[(df['timestamp'] >= common_start) & (df['timestamp'] <= common_end)])
            print(f"     {file_name}: {common_rows:,} rows")

# Summary
print(f"\n{'=' * 80}")
print("SUMMARY")
print(f"{'=' * 80}")

total_files = len(file_names)
successful = sum(1 for r in results.values() if r['status'] == 'success')
failed = total_files - successful

print(f"\n📊 File Status:")
print(f"   Total files: {total_files}")
print(f"   ✅ Successful: {successful}")
print(f"   ❌ Failed: {failed}")

if failed > 0:
    print(f"\n   Failed files:")
    for name, res in results.items():
        if res['status'] != 'success':
            print(f"     - {name}: {res['status']}")
            if 'error' in res:
                print(f"       Error: {res['error']}")

# Recommendations
print(f"\n{'=' * 80}")
print("RECOMMENDATIONS FOR ENSEMBLE")
print(f"{'=' * 80}")

if all_same and common_days > 0:
    print("\n✅ All files are ready for ensemble methods!")
    print(f"   - Use date range: {common_start} to {common_end}")
    print(f"   - This ensures all models have predictions for the same period")
elif not all_same:
    print("\n⚠️  Fix column inconsistencies before ensemble")
elif common_days <= 0:
    print("\n⚠️  No common date overlap - check your train/test splits")

# Save detailed report
report_path = base_path / "predictions_analysis_report.txt"
with open(report_path, 'w') as f:
    f.write("PREDICTION FILES ANALYSIS REPORT\n")
    f.write(f"Generated: {datetime.now()}\n")
    f.write("=" * 80 + "\n\n")
    
    for file_name, result in results.items():
        f.write(f"{file_name}:\n")
        for key, value in result.items():
            f.write(f"  {key}: {value}\n")
        f.write("\n")

print(f"\n💾 Detailed report saved to: predictions_analysis_report.txt")

COMPREHENSIVE PREDICTION FILES ANALYSIS

────────────────────────────────────────────────────────────
📄 catboost_trial57_predictions
────────────────────────────────────────────────────────────
✅ Successfully loaded
   Rows: 3,171
   Columns: ['timestamp', 'prob_up', 'prob_down', 'winning_prob', 'prediction', 'actual']
   Date range: 2023-10-16 16:00:00 to 2025-03-28 00:00:00
   Duration: 528 days

────────────────────────────────────────────────────────────
📄 cnn_predictions
────────────────────────────────────────────────────────────
✅ Successfully loaded
   Rows: 3,147
   Columns: ['timestamp', 'prob_up', 'prob_down', 'winning_prob', 'prediction', 'actual']
   Date range: 2023-10-20 16:00:00 to 2025-03-28 00:00:00
   Duration: 524 days

────────────────────────────────────────────────────────────
📄 gru_trial28_predictions
────────────────────────────────────────────────────────────
✅ Successfully loaded
   Rows: 2,266
   Columns: ['timestamp', 'prob_up', 'prob_down', 'winning_prob',

  df[timestamp_col] = pd.to_datetime(df[timestamp_col])
  df[timestamp_col] = pd.to_datetime(df[timestamp_col])
  df[timestamp_col] = pd.to_datetime(df[timestamp_col])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  df['timestamp'] = pd.to_datetime(df['timestamp'])


In [None]:
'''
Final problems list

1)lgbm has a column name mismatch
2) xgboost has a column name mismatch

3) gru has bad validation date start
4) lgbm has no date start at all

5) CNN-LSTM hybrid model has no predictions file at all - need to re run the model on a better combination then make the csv
'''

# Part 1

In [10]:
import pandas as pd
import os

# Define the path
base_path = r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\Final_runs_csv"

# Files with per-file date parsing options
files_to_fix = {
    "lgbm_predictions_formatted": {"dayfirst": True},
    "xgboost_predictions": {"dayfirst": False}  # assumed US format
}

# Standard column order expected for ensemble
standard_cols = ['timestamp', 'prob_up', 'prob_down', 'winning_prob', 'prediction', 'actual']

print("=" * 80)
print("FIXING DATE FORMAT + COLUMN ORDER")
print("=" * 80)

for file_name, parse_opts in files_to_fix.items():
    file_path = os.path.join(base_path, f"{file_name}.csv")
    
    try:
        print(f"\nProcessing {file_name}...")
        df = pd.read_csv(file_path)

        # Find timestamp column
        timestamp_col = next((col for col in df.columns if 'timestamp' in col.lower() or 'date' in col.lower()), None)
        if not timestamp_col:
            print(f"✗ No timestamp column found.")
            continue

        # Show current format and parse
        print(f"Original format sample: {df[timestamp_col].iloc[0]}")
        df[timestamp_col] = pd.to_datetime(df[timestamp_col], **parse_opts)
        print(f"Converted: {df[timestamp_col].iloc[0]}")
        print(f"Date range: {df[timestamp_col].min()} to {df[timestamp_col].max()}")

        # Backup original
        backup_path = file_path.replace(".csv", "_backup.csv")
        df.to_csv(backup_path, index=False)
        print(f"✓ Backup saved to: {os.path.basename(backup_path)}")

        # Reorder if all standard columns are present
        if set(standard_cols).issubset(df.columns):
            df = df[standard_cols]
        else:
            print("⚠️ Skipped column reordering: missing expected columns.")

        # Save final
        df.to_csv(file_path, index=False)
        print(f"✓ Fixed and saved: {file_name}.csv")

    except Exception as e:
        print(f"✗ Error processing {file_name}: {str(e)}")

print("\n" + "=" * 80)
print("All fixes complete. You're ready to run ensemble voting!")
print("=" * 80)


FIXING DATE FORMAT + COLUMN ORDER

Processing lgbm_predictions_formatted...
Original format sample: 08/08/2023 00:00
Converted: 2023-08-08 00:00:00
Date range: 2023-08-08 00:00:00 to 2025-03-28 00:00:00
✓ Backup saved to: lgbm_predictions_formatted_backup.csv
✗ Error processing lgbm_predictions_formatted: [Errno 13] Permission denied: 'C:\\Users\\ADMIN\\Desktop\\Coding_projects\\stock_market_prediction\\Stock-Market-Prediction\\Final_runs_csv\\lgbm_predictions_formatted.csv'

Processing xgboost_predictions...
Original format sample: 07/05/2023 12:00
✗ Error processing xgboost_predictions: time data "13/05/2023 00:00" doesn't match format "%m/%d/%Y %H:%M", at position 33. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `d

In [12]:
import pandas as pd
from pathlib import Path

def check_timestamp_csv(csv_path: str, dayfirst=False):
    path = Path(csv_path)
    print(f"\n🔍 Checking {path.name}")
    try:
        # read only timestamp column
        ts = pd.read_csv(path, usecols=['timestamp'])['timestamp']
    except ValueError:
        print("❌  No column literally named 'timestamp'. Double-check header.")
        return
    
    # parse
    ts_parsed = pd.to_datetime(ts, dayfirst=dayfirst, errors='coerce', utc=True)
    
    bad = ts_parsed.isna().sum()
    print(f"   → {bad:,} of {len(ts_parsed):,} rows failed to parse")
    
    # show a few examples
    print("\n   Raw strings (first 5):")
    print(ts.head().tolist())
    print("\n   Parsed values (first 5):")
    print(ts_parsed.head().tolist())
    
    if bad == 0:
        print(f"\n   ✅ earliest timestamp: {ts_parsed.min()}")
    else:
        # show up to 5 examples of unparsable rows
        bad_examples = ts[ts_parsed.isna()].head().tolist()
        print("\n   ❌ examples of unparsable values:")
        for s in bad_examples:
            print("      ", s)

# ----------------------------------------------------------------------
BASE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
            r"\Stock-Market-Prediction\Final_runs_csv")

check_timestamp_csv(BASE / "lgbm_predictions_formatted.csv", dayfirst=True)
check_timestamp_csv(BASE / "xgboost_predictions.csv", dayfirst=False)   # try False first
check_timestamp_csv(BASE / "xgboost_predictions.csv", dayfirst=True)    # …then True



🔍 Checking lgbm_predictions_formatted.csv
   → 0 of 3,589 rows failed to parse

   Raw strings (first 5):
['08/08/2023 00:00', '08/08/2023 04:00', '08/08/2023 08:00', '08/08/2023 12:00', '08/08/2023 16:00']

   Parsed values (first 5):
[Timestamp('2023-08-08 00:00:00+0000', tz='UTC'), Timestamp('2023-08-08 04:00:00+0000', tz='UTC'), Timestamp('2023-08-08 08:00:00+0000', tz='UTC'), Timestamp('2023-08-08 12:00:00+0000', tz='UTC'), Timestamp('2023-08-08 16:00:00+0000', tz='UTC')]

   ✅ earliest timestamp: 2023-08-08 00:00:00+00:00

🔍 Checking xgboost_predictions.csv
   → 2,527 of 4,144 rows failed to parse

   Raw strings (first 5):
['07/05/2023 12:00', '07/05/2023 16:00', '07/05/2023 20:00', '08/05/2023 00:00', '08/05/2023 04:00']

   Parsed values (first 5):
[Timestamp('2023-07-05 12:00:00+0000', tz='UTC'), Timestamp('2023-07-05 16:00:00+0000', tz='UTC'), Timestamp('2023-07-05 20:00:00+0000', tz='UTC'), Timestamp('2023-08-05 00:00:00+0000', tz='UTC'), Timestamp('2023-08-05 04:00:00+000

# Part 2