In [None]:
import numpy as np
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import ElasticNet, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, \
    HistGradientBoostingClassifier, IsolationForest, AdaBoostClassifier, BaggingClassifier
from sklearn.calibration import CalibratedClassifierCV
from datetime import datetime
from KL.kl.utils import load_fx
import matplotlib.pyplot as plt
from mrmr import mrmr_classif
import pandas as pd
import chime

In [None]:
def get_models(class_weight=None):
    """Build a fresh, unfitted collection of base estimators.

    Parameters
    ----------
    class_weight : dict, str or None, default=None
        Forwarded unchanged to every estimator below that is constructed
        with a ``class_weight`` argument. The default lets the function be
        called without arguments.

    Returns
    -------
    list
        Newly constructed estimators, always in the same order.
    """
    return [
        RandomForestClassifier(class_weight=class_weight),
        ExtraTreesClassifier(class_weight=class_weight),
        GradientBoostingClassifier(),
        HistGradientBoostingClassifier(class_weight=class_weight),
        GaussianNB(),
        BernoulliNB(),
        # NOTE(review): IsolationForest is an outlier detector and ElasticNet a
        # regressor, so their predict() output is presumably not a 0/1 class
        # label like the other entries - confirm they belong in this list.
        IsolationForest(),
        ElasticNet(),
        LinearSVC(class_weight=class_weight),
        SGDClassifier(class_weight=class_weight),
        SVC(probability=True, class_weight=class_weight),
        AdaBoostClassifier(algorithm='SAMME', n_estimators=100),
        BaggingClassifier(estimator=SVC(), n_estimators=100, random_state=0),
    ]

In [None]:
def get_calibrated_models(class_weight, cv=5, method="isotonic"):
    """Build unfitted base estimators, each wrapped in CalibratedClassifierCV.

    Parameters
    ----------
    class_weight : dict, str or None
        Forwarded unchanged to every base estimator below that is constructed
        with a ``class_weight`` argument.
    cv : int, default=5
        ``cv`` argument passed to every ``CalibratedClassifierCV`` wrapper.
    method : str, default="isotonic"
        ``method`` argument passed to every ``CalibratedClassifierCV`` wrapper.

    Returns
    -------
    list
        One ``CalibratedClassifierCV`` per base estimator, in a fixed order.
    """
    # Every wrapper gets its own, newly constructed base estimator.
    base_estimators = [
        RandomForestClassifier(class_weight=class_weight),
        ExtraTreesClassifier(class_weight=class_weight),
        GradientBoostingClassifier(),
        HistGradientBoostingClassifier(class_weight=class_weight),
        GaussianNB(),
        BernoulliNB(),
        LinearSVC(class_weight=class_weight),
        SGDClassifier(class_weight=class_weight),
        SVC(probability=True, class_weight=class_weight),
        AdaBoostClassifier(algorithm='SAMME', n_estimators=100),
        BaggingClassifier(estimator=SVC(), n_estimators=100, random_state=0),
    ]
    return [
        CalibratedClassifierCV(base_estimator, cv=cv, method=method)
        for base_estimator in base_estimators
    ]

In [None]:
def get_models_small(class_weight):
    """Return a reduced two-model collection for quick experiments.

    The random forest receives ``class_weight``; the gradient-boosting model
    is built with its default arguments.
    """
    forest = RandomForestClassifier(class_weight=class_weight)
    boosting = GradientBoostingClassifier()
    return [forest, boosting]

In [None]:
# Settings and data for the single-pair walk-forward experiment below.
window_size, pair = 10, 'EURUSD'
X, y_high, y_low, y_close, returns = load_fx(
    pair=pair,
    window_size=window_size,
    shift=3,
    data_start=0,
    data_end=6000,
)

In [None]:
# Walk-forward evaluation: at every step all base models are refitted on the
# data seen so far, and a GaussianNB meta-model trained on the base models'
# earlier predictions decides whether to buy or sell the next step.
worm_up_period = 2      # iterations to run before the meta-model is evaluated
n_correct = 0           # meta-model predictions that matched the real class
counter = 1             # 1-based iteration number
eval_counter = 0        # iterations in which the meta-model was evaluated
predictions_list = []   # one row of base-model predictions per collected step
real_list = []          # real class of each collected step
# Pass class_weight explicitly; None leaves the estimators unweighted.
classifiers_list = get_models(None)
percent_accuracy_list = []
returns_list = []
for idx in range(5000, 5999):
    # Expanding window: train on every row before idx, test on row idx only.
    X_wnd_trn = X[0:idx]
    X_wnd_tst = X[idx:idx+1]
    y_close_wnd_trn = y_close[0:idx]
    y_close_tst = y_close[idx]
    return_end = returns[idx]

    y_close_predict_list = []
    for classifier in classifiers_list:
        # Each classifier sees the whole training window in a new random row order.
        row_order = np.random.permutation(X_wnd_trn.shape[0])
        classifier.fit(X_wnd_trn[row_order, :], y_close_wnd_trn[row_order])
        y_close_predict_list.append(classifier.predict(X_wnd_tst)[-1])

    if counter > 1:
        # Predictions and the matching real class are collected from the
        # second iteration onwards, always together.
        predictions_list.append(y_close_predict_list)
        real_list.append(y_close_tst)

    if counter > worm_up_period:
        predictions_list_mx = np.array(predictions_list)
        real_list_mx = np.array(real_list)

        if predictions_list_mx.shape[0] == len(real_list_mx):
            # Fit on every collected step except the current one (the last
            # row), then predict the current step.
            meta = GaussianNB()
            X_recent = predictions_list_mx[:-1]
            y_recent = real_list_mx[:-1]
            meta.fit(X_recent, y_recent)
            meta_prediction = meta.predict([y_close_predict_list])[0]

            # Class 0 means sell (negated return); anything else means buy.
            returns_list.append(-return_end if meta_prediction == 0 else return_end)

            if meta_prediction == y_close_tst:
                n_correct += 1
            eval_counter += 1
        else:
            print(f"Skipping Meta Prediction at Iteration {counter}, inconsistent shapes")

    if counter <= worm_up_period:
        print(f'warming up:  {counter}')
    elif eval_counter > 0:
        # Guarded so a skipped first evaluation cannot divide by zero.
        print(f"Test Accuracy for step {eval_counter} is: {n_correct / eval_counter} ")
        percent_accuracy_list.append(n_correct / eval_counter)

    counter += 1

In [None]:
# Running meta-model accuracy after each evaluated step.
percent_accuracy_list_mx = np.asarray(percent_accuracy_list)
plt.plot(percent_accuracy_list_mx)

In [None]:
# One bar per evaluated step, signed by the meta-model's buy/sell decision.
returns_list_mx = np.array(returns_list)
plt.bar(x=np.arange(returns_list_mx.shape[0]), height=returns_list_mx)

In [None]:
# Equity curve: running total of the signed per-step returns.
equity_curve = np.cumsum(returns_list_mx)
plt.plot(equity_curve)

In [None]:
from sklearn.linear_model import RidgeClassifierCV

# Scratch check: construct a RidgeClassifierCV with an empty class_weight
# mapping. `c` is not referenced again in the visible part of the notebook.
c = RidgeClassifierCV(class_weight={})

In [None]:
# Reload the first `data_end` rows of the current pair with a new shift.
data_end, shift = 5000, 3
X, y_high, y_low, y_close, returns = load_fx(
    pair=pair,
    window_size=window_size,
    shift=shift,
    data_start=0,
    data_end=data_end,
)
def one_train_set(X, y, n_features=4, zero_ratio=0.5):
    """Select features with mRMR on a set where class 0 is under-sampled.

    Every class-1 row is kept. Class-0 rows are drawn at random, without
    replacement, until there are ``zero_ratio`` times as many of them as
    class-1 rows (or until class 0 runs out). The reduced set is shuffled and
    handed to ``mrmr_classif``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Feature matrix, indexed as ``X[rows, :]``.
    y : array of shape (n_samples,)
        Labels; rows equal to 0 and 1 are used.
    n_features : int, default=4
        Number of features requested from ``mrmr_classif`` (its ``K``).
    zero_ratio : float, default=0.5
        Number of class-0 rows to keep, as a fraction of the class-1 count.

    Returns
    -------
    numpy.ndarray
        The selected feature (column) indices, in the order returned by
        ``mrmr_classif``.
    """
    y0_idx = np.flatnonzero(y == 0)
    y1_idx = np.flatnonzero(y == 1)

    # Sample from the class-0 ROW NUMBERS themselves. Indexing X / y with
    # positions inside y0_idx would pick arbitrary rows of either class.
    n_zero_keep = int(len(y1_idx) * zero_ratio)
    y0_keep_idx = np.random.permutation(y0_idx)[:n_zero_keep]

    # Reduced set: sampled class-0 rows plus all class-1 rows, in random order.
    keep_idx = np.concatenate((y0_keep_idx, y1_idx))
    np.random.shuffle(keep_idx)

    X_reduced_df = pd.DataFrame(X[keep_idx, :])
    y_reduced = pd.Series(y[keep_idx])

    selected_features_list = mrmr_classif(
        X=X_reduced_df, y=y_reduced, K=n_features, show_progress=False
    )
    return np.array(selected_features_list)

selected_features_idx1 = one_train_set(X, y_close)

# Two random forests with mirrored class weights: model0 favours class 0,
# model1 favours class 1. Only model1 is fitted here, on all features.
model0, model1 = (
    RandomForestClassifier(class_weight=weights)
    for weights in ({0: 2, 1: 1}, {0: 1, 1: 2})
)
model1.fit(X, y_close)

In [None]:
data_end = 5000
shift = 2
window_size = 10
portion = 0.5  # fraction of the rows each model is fitted on

zero_weight, one_weight = {0: 2, 1: 1}, {0: 1, 1: 2}
# zero_weight, one_weight = {0: 1, 1: 1}, {0: 1, 1: 1}

#             0         1          2        3         4         5
symbols = ['EURUSD', 'GBPUSD', 'AUDUSD', 'USDJPY', 'USDCAD', 'USDCHF']
model_list = []
model_dict = {}


def _has_class_weight(model):
    """Return True if the model, or the estimator it wraps, has class_weight set."""
    wrapped = getattr(model, 'estimator', None)
    if wrapped is None:
        wrapped = getattr(model, 'base_estimator', None)
    return any(
        getattr(candidate, 'class_weight', None) is not None
        for candidate in (model, wrapped)
    )


def _fit_collection(models, X, y, class_weight, portion, row_idx):
    """Fit each model on its own random ``portion`` of the rows.

    ``row_idx`` is shuffled in place before every fit. A model that already
    carries ``class_weight`` (directly or through a wrapper such as
    CalibratedClassifierCV) is fitted as is. Any other model receives
    equivalent per-sample weights, so no model is weighted twice.
    """
    n_train = int(len(y) * portion)
    for model in models:
        np.random.shuffle(row_idx)
        subset = row_idx[:n_train]
        X_part, y_part = X[subset, :], y[subset]
        if class_weight is None or _has_class_weight(model):
            model.fit(X_part, y_part)
        else:
            # Labels missing from class_weight keep weight 1.
            sample_weight = np.ones(len(y_part))
            for label, weight in class_weight.items():
                sample_weight[y_part == label] = weight
            model.fit(X_part, y_part, sample_weight=sample_weight)


for pair in symbols:
    X, y_high, y_low, y_close, returns = load_fx(data_start=0, data_end=data_end, shift=shift, window_size=window_size, pair=pair)

    # Alternatives: get_models(...) or get_models_small(...).
    classifiers_collection_0 = get_calibrated_models(zero_weight)
    classifiers_collection_1 = get_calibrated_models(one_weight)

    LEN = len(y_close)
    arr_idx = np.arange(LEN)
    _fit_collection(classifiers_collection_0, X, y_close, zero_weight, portion, arr_idx)
    _fit_collection(classifiers_collection_1, X, y_close, one_weight, portion, arr_idx)

    # 'class_0' holds the sell-biased ensemble, 'class_1' the buy-biased one.
    model_dict[pair] = {'class_0': classifiers_collection_0, 'class_1': classifiers_collection_1}
chime.success()
print('Train models done')

In [None]:
# Load data for all pairs at the start
pair_data, return_data, class_data = {}, {}, {}
for pair in symbols:
    X, y_high, y_low, y_close, returns = load_fx(data_start=0, data_end=6000, shift=shift, window_size=window_size, pair=pair)
    pair_data[pair] = X
    return_data[pair] = returns
    class_data[pair] = y_close
print('Load data done')    

In [None]:
def _class_score(model, X_row, label):
    """Return the score in [0, 1] that ``model`` gives ``label`` for one row.

    Uses column ``label`` of ``predict_proba`` when the model has it. Otherwise
    the score is 1.0 if the hard prediction equals ``label`` and 0.0 if not, so
    a "buy" prediction can never add to the sell score.
    Assumes the probability columns are ordered as class 0, class 1.
    """
    if hasattr(model, 'predict_proba'):
        return float(model.predict_proba(X_row)[0, label])
    return float(model.predict(X_row)[0] == label)


start_day, end_day = 5000, 5990
n_days = end_day - start_day
# Every pair is assumed to hold equally many models; size from the first one.
reference_pair = next(iter(model_dict))
n_models_0 = len(model_dict[reference_pair]['class_0'])
n_models_1 = len(model_dict[reference_pair]['class_1'])
# Confidence thresholds are computed but not applied by the rule below yet.
buy_confidence_threshold = n_models_1 * 0.95
sell_confidence_threshold = n_models_0 * 0.05
buy_coount, sell_count = 0, 0
prob_sum_class_0_list, prob_sum_class_1_list = [], []
profit = []
# Axes: (day, pair, model).
probs_zero = np.zeros([n_days, len(symbols), n_models_0])
probs_one = np.zeros([n_days, len(symbols), n_models_1])
class_data_eval = np.zeros([n_days, len(symbols)])
return_data_eval = np.zeros([n_days, len(symbols)])
for x_dim, day in enumerate(range(start_day, end_day)):
    for y_dim, pair in enumerate(model_dict):
        class_data_eval[x_dim][y_dim] = class_data[pair][day]
        return_data_eval[x_dim][y_dim] = return_data[pair][day]
        X_today = pair_data[pair][day].reshape(1, -1)  # estimators expect 2-D input
        return_today = return_data[pair][day]

        # Per-model scores of the sell (class 0) and buy (class 1) ensembles.
        scores_zero = [_class_score(model, X_today, 0) for model in model_dict[pair]['class_0']]
        scores_one = [_class_score(model, X_today, 1) for model in model_dict[pair]['class_1']]
        probs_zero[x_dim, y_dim, :len(scores_zero)] = scores_zero
        probs_one[x_dim, y_dim, :len(scores_one)] = scores_one

        prob_sum_class_0 = sum(scores_zero)
        prob_sum_class_1 = sum(scores_one)
        prob_sum_class_0_list.append(prob_sum_class_0)
        prob_sum_class_1_list.append(prob_sum_class_1)

        # Trade in the direction of the larger summed score; a tie means no trade.
        if prob_sum_class_0 > prob_sum_class_1:
            sell_count += 1
            profit.append(-return_today)
        elif prob_sum_class_1 > prob_sum_class_0:
            buy_coount += 1
            profit.append(return_today)

prob_sum_class_0_list = np.array(prob_sum_class_0_list)
prob_sum_class_1_list = np.array(prob_sum_class_1_list)
profit = np.array(profit)
chime.success()
print('Inference done')

In [None]:
# Non-empty labels are needed for plt.legend() to have entries to show.
plt.plot(np.cumsum(profit), label='Cumulative profit')
plt.plot(profit, label='Profit per trade')
plt.legend()
plt.show()

In [None]:
import pickle

# Bundle the inference results so they can be reloaded without re-running
# the models.
struct = {
    "probs_zero": probs_zero,
    "probs_one": probs_one,
    "class_data_eval": class_data_eval,
    "return_data_eval": return_data_eval,
    "symbols": symbols,
}

with open('probs.pickle', 'wb') as f:
    pickle.dump(struct, f, pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# Restore the saved inference results. pickle.load can execute code, so only
# open files this notebook wrote itself. The protocol is detected automatically.
with open('probs.pickle', 'rb') as f:
    struct = pickle.load(f)

probs_zero, probs_one = struct["probs_zero"], struct["probs_one"]
class_data_eval, return_data_eval = struct["class_data_eval"], struct["return_data_eval"]
symbols = struct["symbols"]

In [None]:
# Summed ensemble scores per evaluated (day, pair), one line per class.
for series, label in (
    (prob_sum_class_0_list, 'Class 0'),
    (prob_sum_class_1_list, 'Class 1'),
):
    plt.plot(series, label=label)
plt.legend()
plt.show()

In [None]:
# Dump the per-model scores of every pair, one block of lines per day.
n_rows, n_cols = probs_one.shape[0], probs_one.shape[1]
for day_i in range(n_rows):
    print('---------')
    for pair_i in range(n_cols):
        one_txt = ",".join(f"{p:.2f}" for p in probs_one[day_i, pair_i, :])
        zero_txt = ",".join(f"{p:.2f}" for p in probs_zero[day_i, pair_i, :])
        print(f"{symbols[pair_i]} p_one: [{one_txt}], p_zero: [{zero_txt}] class: {class_data_eval[day_i, pair_i]}")

In [None]:
# Sanity check of the result arrays: the three axes of probs_zero are
# reported as items, pairs and models respectively.
print(f'probs_zero: {probs_zero.shape} | probs_one: {probs_one.shape} | Items: {probs_zero.shape[0]} | Pairs: {probs_zero.shape[1]} | Models: {probs_zero.shape[2]}')
print(f'class_data_eval: {class_data_eval.shape} | return_data: {return_data_eval.shape}')

In [None]:
# Feature-relevance scores and model-based selection, computed on whatever
# X / y_close are currently bound in the notebook namespace.
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2, SelectFromModel
f_statistic, p_values_f = f_classif(X,y_close)
# NOTE(review): chi2 presumably requires non-negative feature values -
# confirm that X satisfies this.
chi2_statistic, p_values_chi2 = chi2(X,y_close)
mi = mutual_info_classif(X, y_close)
# NOTE(review): if the forest's importances are normalised to sum to 1, a
# threshold of 0.5 can keep at most one feature - confirm this is intended.
sm = SelectFromModel(estimator=RandomForestClassifier(n_estimators=100), threshold=0.5)
sm.fit(X, y_close)
# Last expression of the cell: shows the names of the selected features.
sm.get_feature_names_out()

In [None]:
# Reload a longer history and replay every (model0, model1) pair over the
# rows from data_end-10 to the second-to-last row.
X, y_high, y_low, y_close, returns = load_fx(data_start=0, data_end=6400, shift=shift, window_size=window_size, pair=pair)
counter, one_count = 0, 0
profit = []
y_pred_list = []
y_pred_proba_list = []
proba_threshold = 0.6  # minimum class probability needed to open a trade
# NOTE(review): model_list is created empty in the training cell and never
# filled in the visible notebook, so this loop only trades once it holds
# (model0, model1) pairs - confirm where it is populated.
for idx in range(data_end-10, len(y_high)-1):
    X_wnd = X[idx,:].reshape(1, -1)
    # Feature selection is disabled for now: use all columns.
    X_wnd_selected = X_wnd
    for models in model_list:
        # Unpack the loop variable, not the stale global `model`.
        model0, model1 = models[0], models[1]

        if hasattr(model1, 'predict_proba'):
            y_pred_proba0 = model0.predict_proba(X_wnd_selected)
            y_pred_proba1 = model1.predict_proba(X_wnd_selected)
            y_pred_proba_list.append(y_pred_proba1)

            # Index the single row directly; float() on a 1-element array
            # is deprecated in NumPy.
            if y_pred_proba1[0, 1] > proba_threshold:
                profit.append(returns[idx])    # buy
            if y_pred_proba0[0, 0] > proba_threshold:
                profit.append(-returns[idx])   # sell
        else:
            print("No probability")
            y_pred_0 = model0.predict(X_wnd_selected)
            y_pred_1 = model1.predict(X_wnd_selected)

            if y_pred_1[0] == 1:
                profit.append(returns[idx])    # buy
            if y_pred_0[0] == 0:
                profit.append(-returns[idx])   # sell

In [None]:
# Turn the collected lists into arrays and plot the cumulative profit.
y_pred_proba_list = np.squeeze(np.array(y_pred_proba_list))
y_pred_list = np.squeeze(np.array(y_pred_list))
profit = np.array(profit)
cumulative_profit = np.cumsum(profit)
plt.plot(cumulative_profit)

In [None]:
class Foo():
    """Per-class mRMR feature selection over a labelled feature matrix.

    The rows of ``X`` are split by label (0 / 1), ``mrmr_classif`` is run on
    each subset separately, and the two resulting column selections are applied
    to the full ``X``.

    NOTE(review): each subset passed to ``mrmr_classif`` holds a single label
    value only - confirm that the relevance scores are meaningful in that
    setting.
    """

    def __init__(self, X, y, max_k):
        """Split ``X`` / ``y`` by class and select up to ``max_k`` features for each.

        Parameters
        ----------
        X : array of shape (n_samples, n_features)
        y : array of shape (n_samples,)
            Labels; rows equal to 0 and 1 are used.
        max_k : int
            ``K`` passed to ``mrmr_classif``.
        """
        super().__init__()
        self.X = X
        self.y = y
        self.max_k = max_k
        self.y0_idx = np.where(self.y == 0)[0]
        self.y1_idx = np.where(self.y == 1)[0]
        # Slice the labels passed to the constructor, not the notebook-level
        # y_close, so the instance is consistent with its own arguments.
        self.y0 = self.y[self.y0_idx]
        self.y1 = self.y[self.y1_idx]
        self.X0 = self.X[self.y0_idx, :]
        self.X1 = self.X[self.y1_idx, :]
        self.X0_df = pd.DataFrame(self.X0)
        self.X1_df = pd.DataFrame(self.X1)
        self.y0_df = pd.Series(self.y0)
        self.y1_df = pd.Series(self.y1)

        selected_features0 = self._select(self.X0_df, self.y0_df)
        print(selected_features0.shape)
        self.X0_selected = self.X[:, selected_features0]

        selected_features1 = self._select(self.X1_df, self.y1_df)
        self.X1_selected = self.X[:, selected_features1]

    def _select(self, X_df, y_series):
        """Run mrmr_classif with K=max_k and return the selected columns as an array."""
        selected = mrmr_classif(X=X_df, y=y_series, K=self.max_k, show_progress=False)
        return np.array(selected)

    def get_zero(self):
        """Return all rows of X restricted to the columns selected on the class-0 subset."""
        return self.X0_selected

    def get_one(self):
        """Return all rows of X restricted to the columns selected on the class-1 subset."""
        return self.X1_selected
    
    

In [None]:
# Run the per-class selection with max_k=5 on the current X / y_close and
# fetch X restricted to the columns selected on the class-0 subset.
f = Foo(X, y_close, 5)
X0 = f.get_zero()