In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import optuna
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Environment sanity check: print the path of the Python interpreter that is
# running this kernel, to confirm which environment the notebook is using.
import sys
print(sys.executable)


/home/onizuka/Documents/Project M1/venv/bin/python3.10


In [23]:
# Load the prepared dataset from the working directory into a DataFrame.
df = pd.read_csv('final_data.csv')

# Leave the column index as the final expression so the cell displays it.
df.columns

Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume', 'MACD_Line',
       'Signal_Line', 'RSI', 'BB_Lower', 'BB_Middle', 'BB_Upper', 'ADX',
       'future_price', 'direction'],
      dtype='object')

## Random Forest Classifier


In [39]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import optuna

# Step 3: Define Features (X) and Target (y)
# 'direction' is the label. 'future_price' is removed from the features as
# well, because keeping it would leak the answer into the model.
X = df.drop(columns=['direction', 'future_price'])
y = df['direction']

# Step 4: Split Data into Training and Testing Sets
# 80/20 split with a fixed seed so the same rows land in each set on every run.
# NOTE(review): train_test_split shuffles rows by default. If final_data.csv is
# stored in chronological order, a shuffled split mixes later rows into the
# training set -- confirm whether shuffle=False (or a time-based split) is
# what is actually wanted here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Feature Importance
# Fit a baseline forest with fixed settings purely to rank the features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Pair every column the forest was fitted on with its importance score and
# sort from most to least important.
importances = rf.feature_importances_
feature_importances = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print("Feature Importances:\n", feature_importances)

# Step 6: Train with Important Features
# Number of top-ranked features to keep (adjust as needed). X has 13 feature
# columns for the column set shown by df.columns, so 12 drops only the
# lowest-ranked one.
N_TOP_FEATURES = 12
top_features = feature_importances['Feature'].head(N_TOP_FEATURES).tolist()
print("Top Features:", top_features)

# Restrict both splits to the selected columns.
X_train_top = X_train[top_features]
X_test_top = X_test[top_features]

# Step 7: Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated accuracy of a random forest.

    Args:
        trial: The Optuna trial used to draw the candidate hyperparameters.

    Returns:
        The mean accuracy over the 3 cross-validation folds, computed on
        ``X_train_top`` / ``y_train``.
    """
    # Hyperparameters are drawn in a fixed order; the keys match the
    # RandomForestClassifier keyword arguments so the dict can be unpacked.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'max_depth': trial.suggest_int('max_depth', 10, 30, step=5),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
    }

    # Fixed random_state so that score differences come from the
    # hyperparameters rather than from the forest's own randomness.
    forest = RandomForestClassifier(random_state=42, **params)

    fold_scores = cross_val_score(forest, X_train_top, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

# Create an Optuna study
# Seed the sampler explicitly: with the default (unseeded) sampler the search
# visits different trials on every run, so best_params and every number
# reported below would change between runs.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_params
print("Best Parameters:", best_params)

# Step 8: Train the Model with Best Parameters
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_top, y_train)

# Step 9: Evaluate the Model
# Predict once on the held-out set and reuse the predictions for every metric.
y_pred = best_rf.predict(X_test_top)
test_accuracy = accuracy_score(y_test, y_pred)

print("\nTest Accuracy:", test_accuracy)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 10: Check for Overfitting/Underfitting
train_accuracy = accuracy_score(y_train, best_rf.predict(X_train_top))

print(f"\nTraining Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

# Step 11: Interpret Overfitting/Underfitting
# Two float accuracies are practically never exactly equal, so the comparison
# uses a tolerance instead of strict ordering. The threshold is a heuristic.
ACCURACY_TOLERANCE = 0.05

# Accuracy obtained by always predicting the most frequent training class;
# a model that cannot clearly beat this on its own training data has not
# learned the signal.
majority_baseline = y_train.value_counts(normalize=True).max()
accuracy_gap = train_accuracy - test_accuracy

if accuracy_gap > ACCURACY_TOLERANCE:
    # Much better on seen data than on unseen data.
    print("\nThe model may be overfitting.")
elif train_accuracy <= majority_baseline + ACCURACY_TOLERANCE:
    # Train and test agree, but even the training fit is near the baseline.
    print("\nThe model may be underfitting.")
else:
    print("\nThe model is well-fitted.")

[I 2025-01-10 22:59:44,662] A new study created in memory with name: no-name-24388726-afc4-455c-90ae-3807decc1439


Feature Importances:
         Feature  Importance
5        Volume    0.094310
8           RSI    0.092923
12          ADX    0.089855
7   Signal_Line    0.086890
6     MACD_Line    0.085278
9      BB_Lower    0.076051
11     BB_Upper    0.074927
10    BB_Middle    0.072923
4          Open    0.066609
0     Adj Close    0.066061
3           Low    0.065039
2          High    0.064967
1         Close    0.064167
Top Features: ['Volume', 'RSI', 'ADX', 'Signal_Line', 'MACD_Line', 'BB_Lower', 'BB_Upper', 'BB_Middle', 'Open', 'Adj Close', 'Low', 'High']


[I 2025-01-10 22:59:50,239] Trial 0 finished with value: 0.5184933831014592 and parameters: {'n_estimators': 94, 'max_depth': 20, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.5184933831014592.
[I 2025-01-10 22:59:53,456] Trial 1 finished with value: 0.5184933831014592 and parameters: {'n_estimators': 77, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.5184933831014592.
[I 2025-01-10 23:00:04,179] Trial 2 finished with value: 0.5181540549711571 and parameters: {'n_estimators': 199, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.5184933831014592.
[I 2025-01-10 23:00:08,401] Trial 3 finished with value: 0.5231308675489198 and parameters: {'n_estimators': 101, 'max_depth': 10, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 3 with value: 0.5231308675489198.
[I 2025-01-10 23:00:20,021] Trial 4 finished with value: 0.5167967424499491 and parameters: {'

Best Parameters: {'n_estimators': 194, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 3}

Test Accuracy: 0.5146992311171416

Confusion Matrix:
 [[591 499]
 [574 547]]

Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.54      0.52      1090
           1       0.52      0.49      0.50      1121

    accuracy                           0.51      2211
   macro avg       0.52      0.52      0.51      2211
weighted avg       0.52      0.51      0.51      2211


Training Accuracy: 0.7543264336613505
Test Accuracy: 0.5146992311171416

The model may be overfitting.
