In [2]:
import backtrader as bt
import yfinance as yf
import numpy as np
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import sys
import pandas as pd
import os
import tempfile
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

In [4]:
class CustomCSVData(bt.feeds.GenericCSVData):
    params = (
        ('datetime', None),
        ('open', -1),
        ('high', -1),
        ('low', -1),
        ('close', -1),
        ('volume', -1),
        ('openinterest', -1),
        ('dtformat', ('%Y-%m-%d')),
    )
    
    def __init__(self, file_type):
        self.file_type = file_type
        df = pd.read_csv(self.p.dataname, parse_dates=['Date'], 
                         dayfirst=True if file_type != '002054.XSHE' else False,
                         header=None if file_type == '002054.XSHE' else 0,
                         names=['Date', 'open', 'close', 
                                'high', 'low', 'volume', 
                                'money', 'avg', 'high_limit', 
                                'low_limit', 'pre_close', 'paused', 
                                'factor'] if file_type == '002054.XSHE' else None,
                         skiprows=1 if file_type == '002054.XSHE' else 0,
                         date_format='%Y-%m-%d' if file_type == '002054.XSHE' else '%m/%d/%y')
        df.sort_values(by='Date', inplace=True)


        temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv')
        df.to_csv(temp_file.name, index=False)
        self.p.dataname = temp_file.name
        super().__init__()

        # Define a dictionary to map file types to their parameters
        file_type_params = {
            '002054.XSHE': {'datetime': 0, 'open': 1, 'high': 2, 'low': 3, 
                            'close': 4, 'volume': 5},
            'aapl': {'datetime': 0, 'open': 1, 'high': 2, 'low': 3, 
                     'close': 4, 'volume': 5, 'openinterest': -1},
            'ERCOTDA_price': {'datetime': 0, 'hour_of_day': 1, 'close': 2}
        }

        # Apply the parameters based on file_type
        params = file_type_params.get(file_type, {})
        for param, value in params.items():
            setattr(self.p, param, value)

        super().__init__()

    def stop(self):
        os.remove(self.p.dataname)

class PrintDataStrategy(bt.Strategy):
    def __init__(self):
        self.bar_counter = 0

    def next(self):
        num_bars_to_print = 5
        if self.bar_counter < num_bars_to_print:
            print(self.data.datetime.date(0), self.data.open[0], 
                  self.data.high[0], self.data.low[0], 
                  self.data.close[0], self.data.volume[0])
            self.bar_counter += 1

def run_print(data):
    print(data.file_type)
    print("Date: Open: High: Low: Close: Volume:")
    cerebro = bt.Cerebro()
    cerebro.addstrategy(PrintDataStrategy)
    cerebro.adddata(data)
    cerebro.run()
    print("...\n")

# Run backtest for each dataset
datasets = ['002054.XSHE', 'aapl', 'ERCOTDA_price']
for dataset in datasets:
    file_name = f'./3csv/{dataset}.csv'
    data = CustomCSVData(dataname=file_name, file_type=dataset)
    run_print(data)

TypeError: read_csv() got an unexpected keyword argument 'date_format'

In [41]:
# Define the parameter grid
param_grid_lr = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear'],
    'max_iter': [1000]  
}

# Create a GridSearchCV object
grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='accuracy')
grid_lr.fit(X_train, y_train)

print("Best Parameters for Logistic Regression:", grid_lr.best_params_)
print("Best Score for Logistic Regression:", grid_lr.best_score_)

param_grid_xgb = {
    'n_estimators': [5, 50],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 10]
}

grid_xgb = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid_xgb, cv=5, scoring='accuracy')
grid_xgb.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters for XGBoost:", grid_xgb.best_params_)
print("Best Score for XGBoost:", grid_xgb.best_score_)

# Get the best estimators
best_log_reg = grid_lr.best_estimator_
best_xgb = grid_xgb.best_estimator_

def evaluate_model(name, model, X_test, y_test):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}\n")

# Evaluate Logistic Regression
evaluate_model("Logistic Regression", best_log_reg, X_test, y_test)
# Evaluate XGBoost
evaluate_model("XGBoost", best_xgb, X_test, y_test)

Best Parameters for Logistic Regression: {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score for Logistic Regression: 0.6962901266701371
Best Parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 50}
Best Score for XGBoost: 0.6574492451847995
Model: Logistic Regression
Accuracy: 0.6920
Precision: 0.6855
Recall: 0.7340
F1 Score: 0.7089
ROC AUC Score: 0.6910

Model: XGBoost
Accuracy: 0.6514
Precision: 0.6484
Recall: 0.6943
F1 Score: 0.6706
ROC AUC Score: 0.6504



In [47]:
# Backtrader strategy
class MLStrategy(bt.Strategy):
    def __init__(self):
        self.ma_5 = bt.indicators.SimpleMovingAverage(self.data.close, period=5)
        self.ma_10 = bt.indicators.SimpleMovingAverage(self.data.close, period=10)
        self.ma_20 = bt.indicators.SimpleMovingAverage(self.data.close, period=20)

        self.logistic_model = best_log_reg
        self.xgboost_model = best_xgb

    def next(self):
        if len(self.data) >= 50:  # Adjust based on your longest indicator
            # Convert to numpy array for TA-Lib calculations
            close_array = np.array(self.data.close.get(size=50))
            
            # Calculate the indicators
            rsi = talib.RSI(close_array, timeperiod=14)[-1]
            macd, macdsignal, _ = talib.MACD(close_array, fastperiod=12, slowperiod=26, signalperiod=9)

            # Create a DataFrame for prediction with the same feature names
            features = pd.DataFrame({
                'Open': [self.data.open[0]],
                'High': [self.data.high[0]],
                'Low': [self.data.low[0]],
                'Close': [self.data.close[0]],
                'Volume': [self.data.volume[0]],
                'MA5': [self.ma_5[0]],
                'MA10': [self.ma_10[0]],
                'MA20': [self.ma_20[0]],
                'RSI': [rsi],
                'MACD': [macd[-1]],
                'MACD_signal': [macdsignal[-1]]
            })

            # features_scaled=features
            features_scaled = pd.DataFrame(scaler.transform(features), columns=features.columns)
           
            logistic_pred = self.logistic_model.predict(features_scaled)[0]
            xgboost_pred = self.xgboost_model.predict(features_scaled)[0]
            final_prediction = round((logistic_pred + xgboost_pred) / 2)
        
            if final_prediction == 1 and not self.position:
                self.order = self.buy()
            elif final_prediction == 0 and self.position:
                self.order = self.close()

    def log(self, txt):
        dt = self.datas[0].datetime.date(0)
        print('%s, %s' % (dt.isoformat(), txt))

if __name__ == '__main__':
    # Create a cerebro instance, add our strategy, some starting cash at broker and a 0.1% broker commission
    strat1 = bt.Cerebro()
    strat1.addstrategy(MLStrategy)
    strat1.broker.setcash(100000)
    strat1.broker.setcommission(commission=0.001)
    strat1.addanalyzer(bt.analyzers.TradeAnalyzer, _name="trade_analyzer")
    strat1.addanalyzer(bt.analyzers.SharpeRatio, _name='sharperatio', riskfreerate=0.01)

    datafeed = bt.feeds.PandasData(dataname=yf.download(ticker, s, end_date, progress=False))

    strat1.adddata(datafeed)

    print('<START> Brokerage account: $%.2f' % strat1.broker.getvalue())
    results = strat1.run()
    print('<FINISH> Brokerage account: $%.2f' % strat1.broker.getvalue())
    # strat1.plot(style='candlestick', loc='grey', grid=False)  # You can leave inside the parentheses empty
    strategy = results[0]
    # print("Trade Analysis:", strategy.analyzers.trade_analyzer.get_analysis())
    print("Trade Analysis Results:")
    trade_analysis = strategy.analyzers.trade_analyzer.get_analysis()
    for key, value in trade_analysis.items():
        print(f"{key}: {value}")
    
    print("\nSharpe Ratio:")
    sharpe_ratio = strategy.analyzers.sharperatio.get_analysis()
    print(sharpe_ratio)

<START> Brokerage account: $100000.00
<FINISH> Brokerage account: $100191.98
Trade Analysis Results:
total: AutoOrderedDict([('total', 413), ('open', 0), ('closed', 413)])
streak: AutoOrderedDict([('won', AutoOrderedDict([('current', 3), ('longest', 8)])), ('lost', AutoOrderedDict([('current', 0), ('longest', 9)]))])
pnl: AutoOrderedDict([('gross', AutoOrderedDict([('total', 234.31801569461823), ('average', 0.5673559702048867)])), ('net', AutoOrderedDict([('total', 191.98233637440205), ('average', 0.4648482720929832)]))])
won: AutoOrderedDict([('total', 163), ('pnl', AutoOrderedDict([('total', 524.2677467179299), ('average', 3.216366544281778), ('max', 123.30533331298828)]))])
lost: AutoOrderedDict([('total', 250), ('pnl', AutoOrderedDict([('total', -332.2854103435277), ('average', -1.3291416413741108), ('max', -22.308037719726563)]))])
long: AutoOrderedDict([('total', 413), ('pnl', AutoOrderedDict([('total', 191.98233637440205), ('average', 0.4648482720929832), ('won', AutoOrderedDict

    Ticker  Accuracy
357    NBL  0.698465
23     EIX  0.698371
154    MYL  0.696566
107   HOLX  0.695263
16     FLT  0.692866
275    IVZ  0.692015
65     NWS  0.691124
319    RJF  0.690742
1     TROW  0.689669
62     PEG  0.687450


In [5]:
import backtrader as bt
import pandas as pd
import os
import tempfile

class CustomCSVData(bt.feeds.GenericCSVData):
    params = (
        ('datetime', None),
        ('open', -1),
        ('high', -1),
        ('low', -1),
        ('close', -1),
        ('volume', -1),
        ('openinterest', -1),
        ('dtformat', ('%Y-%m-%d')),
    )
    
    def __init__(self):
        super().__init__()

    @classmethod
    def preprocess_data(cls, file_name, file_type):
        df = pd.read_csv(file_name, parse_dates=['Date'], 
                         dayfirst=True if file_type != '002054.XSHE' else False,
                         header=None if file_type == '002054.XSHE' else 0,
                         names=['Date', 'open', 'close', 
                                'high', 'low', 'volume', 
                                'money', 'avg', 'high_limit', 
                                'low_limit', 'pre_close', 'paused', 
                                'factor'] if file_type == '002054.XSHE' else None,
                         skiprows=1 if file_type == '002054.XSHE' else 0)
        df.sort_values(by='Date', inplace=True)

        temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv')
        df.to_csv(temp_file.name, index=False)
        return temp_file.name

class PrintDataStrategy(bt.Strategy):
    def __init__(self):
        self.bar_counter = 0

    def next(self):
        num_bars_to_print = 5
        if self.bar_counter < num_bars_to_print:
            print(self.data.datetime.date(0), self.data.open[0], 
                  self.data.high[0], self.data.low[0], 
                  self.data.close[0], self.data.volume[0])
            self.bar_counter += 1

def run_print(data):
    print(data.file_type)
    print("Date: Open: High: Low: Close: Volume:")
    cerebro = bt.Cerebro()
    cerebro.addstrategy(PrintDataStrategy)
    cerebro.adddata(data)
    cerebro.run()
    print("...\n")

# Run backtest for each dataset
datasets = ['002054.XSHE', 'aapl', 'ERCOTDA_price']
for dataset in datasets:
    file_name = f'./3csv/{dataset}.csv'
    preprocessed_file = CustomCSVData.preprocess_data(file_name, dataset)
    data = CustomCSVData(dataname=preprocessed_file)
    run_print(data)
    os.remove(preprocessed_file)  # Clean up temporary file


AttributeError: 'Lines_LineSeries_DataSeries_OHLC_OHLCDateTime_Abst' object has no attribute 'file_type'

In [9]:
import backtrader as bt
import pandas as pd
import os
import tempfile

class CustomCSVData(bt.feeds.GenericCSVData):
    # Adjust the 'dtformat' parameter based on your CSV file's date format
    params = (
        ('datetime', 0),
        ('open', 1),
        ('high', 2),
        ('low', 3),
        ('close', 4),
        ('volume', 5),
        ('openinterest', -1),
        ('dtformat', ('%Y-%m-%d')),  # Adjust this format if your CSV includes time
    )

    def __init__(self):
        super().__init__()

    @classmethod
    def preprocess_data(cls, file_name, file_type):
        # Reading and preprocessing the CSV file
        # Adjust this part based on the actual format of your CSV files
        df = pd.read_csv(file_name, parse_dates=['Date'],
                         dayfirst=True if file_type != '002054.XSHE' else False,
                         header=None if file_type == '002054.XSHE' else 0,
                         names=['Date', 'open', 'close', 
                                'high', 'low', 'volume', 
                                'money', 'avg', 'high_limit', 
                                'low_limit', 'pre_close', 'paused', 
                                'factor'] if file_type == '002054.XSHE' else None,
                         skiprows=1 if file_type == '002054.XSHE' else 0)
        df.sort_values(by='Date', inplace=True)

        temp_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.csv')
        df.to_csv(temp_file.name, index=False)
        return temp_file.name

class PrintDataStrategy(bt.Strategy):
    def __init__(self):
        self.bar_counter = 0

    def next(self):
        num_bars_to_print = 5
        if self.bar_counter < num_bars_to_print:
            print(self.data.datetime.date(0), self.data.open[0], 
                  self.data.high[0], self.data.low[0], 
                  self.data.close[0], self.data.volume[0])
            self.bar_counter += 1

def run_print(data):
    print("Date: Open: High: Low: Close: Volume:")
    cerebro = bt.Cerebro()
    cerebro.addstrategy(PrintDataStrategy)
    cerebro.adddata(data)
    cerebro.run()
    print("...\n")

# Running backtest for each dataset
datasets = ['002054.XSHE','aapl', 'ERCOTDA_price']
for dataset in datasets:
    file_name = f'./csv_data/{dataset}.csv'  # Ensure the file path is correct
    preprocessed_file = CustomCSVData.preprocess_data(file_name, dataset)
    data = CustomCSVData(dataname=preprocessed_file)
    run_print(data)
    os.remove(preprocessed_file)  # Clean up temporary file


FileNotFoundError: [Errno 2] No such file or directory: 'aapl.csv'