In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2, SelectFromModel
from KL.kl.utils import load_fx
import numpy as np
window_size = 10
pair = 'EURUSD'
shift = 2
X, y_high, y_low, y_close, returns = load_fx(data_start=0, data_end=5000, shift=shift, window_size=window_size, pair=pair)

def get_top(score, N=4):
    # Get the indices of the top N features with the highest f_scores
    #N = 4  # Select the top N features
    top_n_indices = np.argsort(score)[-N:]  # Get the indices of the N largest values
    
    # Reverse the order so the top feature is first
    top_n_indices = top_n_indices[::-1]
    
    # print("Indices of top N ANOVA features:", top_n_indices)
    return top_n_indices
# 1. --------------------
f_statistic, p_values_f = f_classif(X,y_close)
anova_top_n_indices = get_top(f_statistic)
print("Indices of top N ANOVA features:", anova_top_n_indices)

# 2. -------------------
# Discretize features into 3 bins
discretizer = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
X_discretized = discretizer.fit_transform(X)
chi2_statistic, p_values_chi2 = chi2(X_discretized,y_close)
chi2_top_n_indices = get_top(chi2_statistic)
print("Indices of top N CHI2 features:", chi2_top_n_indices)
# 3. -----------------------
mi_scores = mutual_info_classif(X, y_close)
mrmr_top_n_indices = get_top(mi_scores)
print("Indices of top N MRMR features:", mrmr_top_n_indices)
# 4. -------------------
sfm = SelectFromModel(estimator=RandomForestClassifier(n_estimators=100), max_features=4)
sfm.fit(X, y_close)
# feature_importances = sfm.estimator_.feature_importances_

model_top_n_indices = sfm.get_support(indices=True)

print(f"Indices of selected features: {model_top_n_indices}")

Indices of top N ANOVA features: [7 3 0 6]
Indices of top N CHI2 features: [6 0 7 2]
Indices of top N MRMR features: [7 1 3 6]
Indices of selected features: [1 2 7]


In [134]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import ElasticNet, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, \
    IsolationForest, BaggingClassifier


def get_models(class_weight):
    classifiers_list = []
    classifiers_list.append(RandomForestClassifier(class_weight=class_weight))#class_weight
    classifiers_list.append(ExtraTreesClassifier(class_weight=class_weight))#class_weight
    classifiers_list.append(GradientBoostingClassifier())
    classifiers_list.append(HistGradientBoostingClassifier(class_weight=class_weight))#class_weight
    classifiers_list.append(GaussianNB())
    classifiers_list.append(BernoulliNB())
    # classifiers_list.append(IsolationForest())
    # classifiers_list.append(ElasticNet())
    classifiers_list.append(KNeighborsClassifier()) 
    classifiers_list.append(LinearSVC(class_weight=class_weight))#class_weight/no predict_proba
    classifiers_list.append(SGDClassifier(class_weight=class_weight))#class_weight/no predict_proba
    classifiers_list.append(SVC(probability=True, class_weight=class_weight))#class_weight/ no predict_proba
    classifiers_list.append(AdaBoostClassifier(algorithm='SAMME', n_estimators=100))
    classifiers_list.append(BaggingClassifier(estimator=SVC(), n_estimators=100, random_state=0))
    return classifiers_list

In [140]:
from sklearn.ensemble import AdaBoostClassifier

X_new_, _, _, y_close_new_, returns = load_fx(data_start=0, data_end=6000, shift=shift, window_size=window_size, pair=pair)
X_new = X_new_[5000:6000, :]
y_new = y_close_new_[5000:6000]
return_new = returns[5000:6000]

indices = [anova_top_n_indices, model_top_n_indices, mrmr_top_n_indices, chi2_top_n_indices]

weights = {0: 1, 1: 1} 
models = get_models(weights)

pred_list = []
Len = len(y_close)
arr_idx = np.arange(Len)
for idx in range(len(indices)):
    # print(indices[idx])
    for model in models:
        np.random.shuffle(arr_idx)
        part_idx = arr_idx[0:int(Len*0.75)]
        
        model.fit(X[part_idx, indices[idx]], y_close[part_idx])
        pred = model.predict(X_new[:, indices[idx]])
        pred_list.append(pred)
pred_list = np.array(pred_list).T

(9, 4)


IndexError: shape mismatch: indexing arrays could not be broadcast together with shapes (3742,) (4,) 

In [None]:
# Calculate pairwise disagreement between models
def disagreement_rate(pred_list):
    n_models = pred_list.shape[1]
    disagreement_matrix = np.zeros((n_models, n_models))

    for i in range(n_models):
        for j in range(i + 1, n_models):
            disagreement = np.mean(pred_list[:, i] != pred_list[:, j])
            disagreement_matrix[i, j] = disagreement
            disagreement_matrix[j, i] = disagreement  # Symmetric matrix

    return disagreement_matrix

	2.	Diversity of Models:
	•	A moderate disagreement rate (in the 30-40% range) suggests that the models are reasonably diverse, meaning they are capturing slightly different patterns in the data based on the different feature selection methods.
	•	The disagreement rates are not extremely low (close to 0), which means the models are not making identical predictions.
	•	Similarly, they are not extremely high (close to 1), which would indicate that the models are completely contradictory, which would not be ideal in an ensemble.

In [None]:
# Calculate the disagreement matrix
disagreement_matrix = disagreement_rate(pred_list)
print("Disagreement matrix between models:")
for row in disagreement_matrix:
    formatted_row = ",".join([f"{value:.4f}" for value in row])
    print(formatted_row)

In [None]:
# Majority Voting: Aggregate the predictions
majority_vote = np.mean(pred_list, axis=1) > 0.5
accuracy = np.mean(majority_vote == y_new)

print(f"Majority voting accuracy: {accuracy:.4f}")

# Individual model accuracies
for idx, pred in enumerate(pred_list.T):
    model_accuracy = np.mean(pred == y_new)
    print(f"Model {idx + 1} accuracy: {model_accuracy:.4f}")

In [None]:
profit = []
for idx in range(len(return_new)):
    if not majority_vote[idx]:
        profit.append(return_new[idx])
    else:
        profit.append(-return_new[idx])
profit = np.array(profit)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(np.cumsum(profit))
ax.set_title('Profit')
plt.show()

In [None]:
import pickle
with open('probs.pickle', 'rb') as f:
    # The protocol version used is detected automatically, so we do not
    # have to specify it.
    struct = pickle.load(f)
probs_zero = struct["probs_zero"]
probs_one = struct["probs_one"] 
class_data_eval = struct["class_data_eval"] 
return_data_eval = struct["return_data_eval"] 
symbols = struct["symbols"]

In [None]:
for x_idx in range(probs_one.shape[0]):
    print('---------')
    for y_idx in range(probs_one.shape[1]):
        formatted_a = ",".join([f"{x:.2f}" for x in probs_one[x_idx, y_idx, :]])
        formatted_b = ",".join([f"{x:.2f}" for x in probs_zero[x_idx, y_idx, :]])
        print(f"{symbols[y_idx]} p_one: [{formatted_a}], p_zero: [{formatted_b}] class: {class_data_eval[x_idx, y_idx]}")