In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

def optimize_for_2000_samples(df):
    """
    Optimized strategy specifically for 2000 sample financial dataset
    """
    
    # 1. DATA CLEANING AND PREPARATION
    print("=" * 60)
    print("DATASET ANALYSIS FOR 2000 SAMPLES")
    print("=" * 60)
    
    # Convert Date and sort
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('Date').reset_index(drop=True)
    
    # Check data quality
    print(f"Original dataset shape: {df.shape}")
    print(f"Missing values per column:")
    missing_info = df.isnull().sum()
    for col, missing in missing_info.items():
        if missing > 0:
            print(f"  {col}: {missing} ({missing/len(df)*100:.1f}%)")
    
    # 2. ENHANCED FEATURE ENGINEERING
    print(f"\nCreating enhanced features...")
    
    # Basic price features
    df['Price_Change'] = df['Close'].pct_change()
    df['Volume_Change'] = df['Volume'].pct_change()
    df['High_Low_Ratio'] = df['High'] / df['Low']
    df['Open_Close_Ratio'] = df['Open'] / df['Close']
    df['Price_Position'] = (df['Close'] - df['Low']) / (df['High'] - df['Low'])
    
    # Moving averages (use shorter windows for 2000 samples)
    for window in [3, 5, 10, 20]:
        df[f'SMA_{window}'] = df['Close'].rolling(window=window).mean()
        df[f'Price_Above_SMA{window}'] = (df['Close'] > df[f'SMA_{window}']).astype(int)
    
    # Volatility indicators
    df['Volatility_5'] = df['Price_Change'].rolling(window=5).std()
    df['Volatility_10'] = df['Price_Change'].rolling(window=10).std()
    
    # Momentum indicators (shorter periods for 2000 samples)
    for period in [2, 3, 5]:
        df[f'Price_Momentum_{period}'] = df['Close'] / df['Close'].shift(period)
        df[f'Volume_Momentum_{period}'] = df['Volume'] / df['Volume'].shift(period)
    
    # Previous day indicators
    df['Prev_Day_Up'] = (df['Close'] > df['Close'].shift(1)).astype(int)
    df['Prev_Day_Volume_Ratio'] = df['Volume'] / df['Volume'].shift(1)
    
    # RSI (14-period)
    delta = df['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    df['RSI'] = 100 - (100 / (1 + rs))
    
    # Bollinger Bands position
    bb_period = 20
    df['BB_Middle'] = df['Close'].rolling(window=bb_period).mean()
    bb_std = df['Close'].rolling(window=bb_period).std()
    df['BB_Upper'] = df['BB_Middle'] + (bb_std * 2)
    df['BB_Lower'] = df['BB_Middle'] - (bb_std * 2)
    df['BB_Position'] = (df['Close'] - df['BB_Lower']) / (df['BB_Upper'] - df['BB_Lower'])
    
    # 3. TARGET VARIABLE CREATION
    df['Next_Day_Close'] = df['Close'].shift(-1)
    df['Price_Change_Pct'] = (df['Next_Day_Close'] - df['Close']) / df['Close']
    
    # Multi-class with optimized thresholds for 2000 samples
    threshold = 0.01  # 1% threshold
    df['Price_Direction'] = np.where(df['Price_Change_Pct'] > threshold, 2,
                                   np.where(df['Price_Change_Pct'] < -threshold, 0, 1))
    
    # 4. FEATURE SELECTION
    features = [
        'Open', 'High', 'Low', 'Volume',
        'Price_Change', 'Volume_Change',
        'High_Low_Ratio', 'Open_Close_Ratio', 'Price_Position',
        'Price_Above_SMA3', 'Price_Above_SMA5', 'Price_Above_SMA10',
        'Volatility_5', 'Volatility_10',
        'Price_Momentum_2', 'Price_Momentum_3', 'Price_Momentum_5',
        'Volume_Momentum_2', 'Volume_Momentum_3', 'Volume_Momentum_5',
        'Prev_Day_Up', 'Prev_Day_Volume_Ratio',
        'RSI', 'BB_Position'
    ]
    
    # Clean data and prepare features
    df_clean = df.dropna()
    print(f"Dataset shape after cleaning: {df_clean.shape}")
    print(f"Samples lost due to feature engineering: {len(df) - len(df_clean)}")
    
    # Select available features
    available_features = [f for f in features if f in df_clean.columns]
    print(f"Available features: {len(available_features)}")
    
    X = df_clean[available_features]
    y = df_clean['Price_Direction']
    
    # Check class distribution
    print(f"\nClass distribution:")
    class_dist = y.value_counts(normalize=True).sort_index()
    for class_idx, proportion in class_dist.items():
        class_name = ['Down', 'Sideways', 'Up'][class_idx]
        print(f"  {class_name}: {proportion:.3f} ({y.value_counts()[class_idx]} samples)")
    
    return df_clean, X, y, available_features

def multiple_evaluation_strategies(X, y, df_clean):
    """
    Evaluate multiple train/test strategies for 2000 samples
    """
    
    print("\n" + "=" * 60)
    print("MULTIPLE EVALUATION STRATEGIES")
    print("=" * 60)
    
    results = {}
    
    # Model configuration optimized for 2000 samples
    model_config = {
        'max_depth': 8,              # Reduced from 10
        'min_samples_split': 10,     # Reduced from 15
        'min_samples_leaf': 5,       # Reduced from 8
        'max_features': 'sqrt',
        'class_weight': 'balanced',
        'random_state': 42
    }
    
    # Strategy 1: Standard 75/25 split (better for 2000 samples)
    print("1. Standard 75/25 Time-based Split")
    split_index = int(len(X) * 0.75)  # 1500 train, 500 test
    
    X_train = X[:split_index]
    X_test = X[split_index:]
    y_train = y[:split_index]
    y_test = y[split_index:]
    
    print(f"   Training: {len(X_train)} samples")
    print(f"   Testing: {len(X_test)} samples")
    
    model = DecisionTreeClassifier(**model_config)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results['standard_75_25'] = accuracy_score(y_test, y_pred)
    
    # Strategy 2: 80/20 split (your current approach)
    print("\n2. Current 80/20 Time-based Split")
    split_index = int(len(X) * 0.80)  # 1600 train, 400 test
    
    X_train_80 = X[:split_index]
    X_test_80 = X[split_index:]
    y_train_80 = y[:split_index]
    y_test_80 = y[split_index:]
    
    print(f"   Training: {len(X_train_80)} samples")
    print(f"   Testing: {len(X_test_80)} samples")
    
    model.fit(X_train_80, y_train_80)
    y_pred_80 = model.predict(X_test_80)
    results['current_80_20'] = accuracy_score(y_test_80, y_pred_80)
    
    # Strategy 3: Time Series Cross-Validation (5-fold)
    print("\n3. Time Series Cross-Validation (5-fold)")
    tscv = TimeSeriesSplit(n_splits=5)
    cv_scores = []
    
    fold_info = []
    for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
        X_train_cv = X.iloc[train_idx]
        X_test_cv = X.iloc[test_idx]
        y_train_cv = y.iloc[train_idx]
        y_test_cv = y.iloc[test_idx]
        
        model.fit(X_train_cv, y_train_cv)
        y_pred_cv = model.predict(X_test_cv)
        score = accuracy_score(y_test_cv, y_pred_cv)
        cv_scores.append(score)
        
        fold_info.append({
            'fold': i+1,
            'train_size': len(train_idx),
            'test_size': len(test_idx),
            'accuracy': score
        })
    
    for info in fold_info:
        print(f"   Fold {info['fold']}: Train={info['train_size']}, Test={info['test_size']}, Acc={info['accuracy']:.4f}")
    
    results['time_series_cv'] = np.mean(cv_scores)
    results['time_series_cv_std'] = np.std(cv_scores)
    
    # Strategy 4: Rolling Window (1 year train, 3 months test)
    print("\n4. Rolling Window (252 train, 63 test)")
    window_size = 252  # 1 year
    test_size = 63     # 3 months
    
    rolling_scores = []
    valid_windows = 0
    
    for i in range(window_size, len(X) - test_size + 1, test_size):
        if i + test_size <= len(X):
            train_start = i - window_size
            train_end = i
            test_start = i
            test_end = i + test_size
            
            X_train_roll = X.iloc[train_start:train_end]
            X_test_roll = X.iloc[test_start:test_end]
            y_train_roll = y.iloc[train_start:train_end]
            y_test_roll = y.iloc[test_start:test_end]
            
            model.fit(X_train_roll, y_train_roll)
            y_pred_roll = model.predict(X_test_roll)
            score = accuracy_score(y_test_roll, y_pred_roll)
            rolling_scores.append(score)
            valid_windows += 1
    
    print(f"   Valid windows: {valid_windows}")
    if rolling_scores:
        results['rolling_window'] = np.mean(rolling_scores)
        print(f"   Window scores: {[f'{score:.4f}' for score in rolling_scores]}")
    else:
        results['rolling_window'] = None
        print("   Not enough data for rolling window")
    
    # Strategy 5: Expanding Window
    print("\n5. Expanding Window")
    initial_train = 500   # Start with 500 samples
    test_size = 100       # Test on 100 samples
    
    expanding_scores = []
    valid_expansions = 0
    
    for i in range(initial_train, len(X) - test_size + 1, test_size):
        if i + test_size <= len(X):
            X_train_exp = X.iloc[0:i]
            X_test_exp = X.iloc[i:i+test_size]
            y_train_exp = y.iloc[0:i]
            y_test_exp = y.iloc[i:i+test_size]
            
            model.fit(X_train_exp, y_train_exp)
            y_pred_exp = model.predict(X_test_exp)
            score = accuracy_score(y_test_exp, y_pred_exp)
            expanding_scores.append(score)
            valid_expansions += 1
    
    print(f"   Valid expansions: {valid_expansions}")
    if expanding_scores:
        results['expanding_window'] = np.mean(expanding_scores)
        print(f"   Expansion scores: {[f'{score:.4f}' for score in expanding_scores]}")
    else:
        results['expanding_window'] = None
        print("   Not enough data for expanding window")
    
    return results, model, X_train, X_test, y_train, y_test

def detailed_analysis(model, X_train, X_test, y_train, y_test, available_features):
    """
    Detailed analysis of the best model
    """
    
    print("\n" + "=" * 60)
    print("DETAILED MODEL ANALYSIS")
    print("=" * 60)
    
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)
    
    # Basic metrics
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy: {accuracy:.4f}")
    
    # Classification report
    target_names = ['Down', 'Sideways', 'Up']
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=target_names))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix:")
    print("         Down  Sideways  Up")
    for i, row in enumerate(cm):
        print(f"{target_names[i]:>8}: {row}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': available_features,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 10 Most Important Features:")
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
        print(f"{i+1:2d}. {row['feature']:<20} {row['importance']:.4f}")
    
    # Trading strategy analysis
    print(f"\nTrading Strategy Analysis:")
    
    # High confidence predictions
    max_proba = y_pred_proba.max(axis=1)
    
    for confidence_threshold in [0.5, 0.6, 0.7]:
        high_confidence = max_proba > confidence_threshold
        if high_confidence.sum() > 0:
            high_conf_accuracy = (y_test[high_confidence] == y_pred[high_confidence]).mean()
            print(f"Confidence >{confidence_threshold:.1f}: {high_confidence.sum()}/{len(y_test)} predictions, Accuracy: {high_conf_accuracy:.4f}")
    
    # Class-specific performance
    print(f"\nClass-specific Trading Performance:")
    for class_idx, class_name in enumerate(target_names):
        class_predictions = (y_pred == class_idx)
        if class_predictions.sum() > 0:
            class_accuracy = (y_test[class_predictions] == class_idx).mean()
            print(f"{class_name:>8} predictions: {class_predictions.sum()}/{len(y_test)} ({class_predictions.mean():.2%}), Accuracy: {class_accuracy:.4f}")

# Main execution
def main():
    # Load your dataset
    df = pd.read_csv('D:\\ML PROJECTS\\AML lab\\augmented_financial_data.csv')
    
    # Optimize for 2000 samples
    df_clean, X, y, available_features = optimize_for_2000_samples(df)
    
    # Multiple evaluation strategies
    results, best_model, X_train, X_test, y_train, y_test = multiple_evaluation_strategies(X, y, df_clean)
    
    # Display results summary
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)
    
    for strategy, score in results.items():
        if score is not None:
            if strategy == 'time_series_cv':
                print(f"{strategy:<20}: {score:.4f} (±{results['time_series_cv_std']:.4f})")
            else:
                print(f"{strategy:<20}: {score:.4f}")
        else:
            print(f"{strategy:<20}: Not applicable")
    
    # Recommendations
    print(f"\n" + "=" * 60)
    print("RECOMMENDATIONS FOR 2000 SAMPLES")
    print("=" * 60)
    
    print("1. OPTIMAL SPLIT STRATEGY:")
    print("   - Use 75/25 split (1500 train, 500 test)")
    print("   - This gives you more test samples for robust evaluation")
    
    print("\n2. CROSS-VALIDATION:")
    print("   - Use Time Series Cross-Validation with 5 folds")
    print("   - This gives you multiple evaluation points")
    
    print("\n3. MODEL PARAMETERS:")
    print("   - max_depth=8 (reduced from 10)")
    print("   - min_samples_split=10 (reduced from 15)")
    print("   - min_samples_leaf=5 (reduced from 8)")
    
    print("\n4. FEATURE ENGINEERING:")
    print("   - Use shorter time windows (3, 5, 10 days)")
    print("   - Focus on momentum and volatility indicators")
    
    # Detailed analysis
    detailed_analysis(best_model, X_train, X_test, y_train, y_test, available_features)

if __name__ == "__main__":
    main()

DATASET ANALYSIS FOR 2000 SAMPLES
Original dataset shape: (2000, 10)
Missing values per column:
  returns: 260 (13.0%)
  ma_5: 1650 (82.5%)
  ma_20: 1650 (82.5%)

Creating enhanced features...
Dataset shape after cleaning: (347, 41)
Samples lost due to feature engineering: 1653
Available features: 24

Class distribution:
  Down: 0.473 (164 samples)
  Sideways: 0.069 (24 samples)
  Up: 0.458 (159 samples)

MULTIPLE EVALUATION STRATEGIES
1. Standard 75/25 Time-based Split
   Training: 260 samples
   Testing: 87 samples

2. Current 80/20 Time-based Split
   Training: 277 samples
   Testing: 70 samples

3. Time Series Cross-Validation (5-fold)
   Fold 1: Train=62, Test=57, Acc=0.7895
   Fold 2: Train=119, Test=57, Acc=0.5088
   Fold 3: Train=176, Test=57, Acc=0.5789
   Fold 4: Train=233, Test=57, Acc=0.5263
   Fold 5: Train=290, Test=57, Acc=0.6140

4. Rolling Window (252 train, 63 test)
   Valid windows: 1
   Window scores: ['0.4286']

5. Expanding Window
   Valid expansions: 0
   Not eno