In [8]:
import matplotlib
matplotlib.use('TkAgg')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from new_datasets_py import create_subsets
from sklearn.base import clone
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (balanced_accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, log_loss)
from imblearn.ensemble import BalancedRandomForestClassifier


In [4]:
# Build a table of 10-day closing-price windows per coin, each labelled 1 when
# the close on day 10 is strictly higher than the close on day 7, else 0.
WINDOW_SIZE = 10   # number of consecutive closes stored in each window
FEATURE_DAYS = 7   # the label compares day WINDOW_SIZE against day FEATURE_DAYS

data = pd.read_csv('crypto-markets.csv')

# Derive the filtered frame as a new object instead of writing into a slice of
# `data`: `.loc[:, 'date'] = ...` on a slice triggers chained-assignment
# warnings and, on pandas 2.x, sets in place so the column can stay `object`
# dtype. `assign` guarantees a real datetime column, and a non-inplace
# `set_index` keeps the cell idempotent on re-run.
filtered_data = (
    data[data['ranknow'] < 30]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)
datasets_with_labels = []

grouped = filtered_data.groupby('slug')

for crypto, group in grouped:
    # Rolling windows only make sense in chronological order. The stable sort
    # leaves an already-sorted series untouched.
    # NOTE(review): assumes one row per coin per date -- confirm against the CSV.
    close_values = group['close'].sort_index(kind='mergesort').values

    # Coins with fewer than WINDOW_SIZE rows yield an empty range and are skipped.
    for start in range(len(close_values) - WINDOW_SIZE + 1):
        end = start + WINDOW_SIZE
        window = close_values[start:end]
        value_day_7 = window[FEATURE_DAYS - 1]
        value_day_10 = window[WINDOW_SIZE - 1]
        label = 1 if value_day_10 > value_day_7 else 0

        datasets_with_labels.append((window, label))

combined_table = pd.DataFrame(datasets_with_labels, columns=['close_values', 'label'])

# Print to check
print(combined_table)
# isnull() only flags cells that are themselves missing; NaN closes *inside* a
# window array are not counted here.
missing_values = combined_table.isnull().sum()
print(missing_values)

                                            close_values  label
0      [0.695589, 0.742796, 0.86392, 0.734774, 1.07, ...      0
1      [0.742796, 0.86392, 0.734774, 1.07, 1.43, 1.33...      1
2      [0.86392, 0.734774, 1.07, 1.43, 1.33, 1.4, 1.4...      1
3      [0.734774, 1.07, 1.43, 1.33, 1.4, 1.4, 1.31, 2...      1
4      [1.07, 1.43, 1.33, 1.4, 1.4, 1.31, 2.38, 3.18,...      1
...                                                  ...    ...
21966  [0.125544, 0.122172, 0.119888, 0.117055, 0.100...      1
21967  [0.122172, 0.119888, 0.117055, 0.100511, 0.112...      1
21968  [0.119888, 0.117055, 0.100511, 0.112595, 0.111...      1
21969  [0.117055, 0.100511, 0.112595, 0.11143, 0.1131...      1
21970  [0.100511, 0.112595, 0.11143, 0.113184, 0.1123...      1

[21971 rows x 2 columns]
close_values    0
label           0
dtype: int64


  return Index(sequences[0], name=names)


In [5]:
# Class balance: count each label once, then reuse the counts for the printout,
# the bar chart and the minority/majority ratio.
class_counts = combined_table['label'].value_counts()
print(class_counts)

balance_ax = class_counts.plot(kind='bar')
balance_ax.set_title('Class Distribution')
balance_ax.set_xlabel('Class')
balance_ax.set_ylabel('Frequency')
plt.show()

# Ratio of the rarest class to the most common one (1.0 means perfectly balanced).
imbalance_ratio = class_counts.min() / class_counts.max()
print(f"Imbalance Ratio: {imbalance_ratio}")

label
0    11709
1    10262
Name: count, dtype: int64
Imbalance Ratio: 0.8764198479801861


In [6]:
# Features: the first 7 closes of every window; the remaining closes are only
# used (in an earlier cell) to derive the label, so they are cut off here.
feature_rows = [window[:7] for window in combined_table['close_values']]
X = np.array(feature_rows)

# Target: the 0/1 label of each window as an integer array.
label_values = combined_table['label'].astype(int).tolist()
y = np.array(label_values)


In [7]:
# Cross-validated comparison of several classifiers on (X, y).
# Every stochastic estimator and the CV splitter share one seed so that
# Restart & Run All reproduces the same numbers.
SEED = 42
METRIC_NAMES = ['balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'log_loss']

classifiers = {
    "Multi-Layer Perceptron": MLPClassifier(random_state=SEED),
    "Decision Tree": DecisionTreeClassifier(random_state=SEED),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Balanced Random Forest": BalancedRandomForestClassifier(replacement=True, sampling_strategy='all', random_state=SEED, bootstrap=False),
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=SEED)
results = {clf_name: {metric: [] for metric in METRIC_NAMES} for clf_name in classifiers}


def _nan_mean_std(values):
    """Return (mean, std) of the non-NaN entries, or (nan, nan) if there are none.

    Gives the same numbers as np.nanmean / np.nanstd but without the
    RuntimeWarning those raise on an all-NaN input.
    """
    finite = np.asarray(values, dtype=float)
    finite = finite[~np.isnan(finite)]
    if finite.size == 0:
        return np.nan, np.nan
    return finite.mean(), finite.std()


# Evaluation loop
for clf_name, clf in classifiers.items():
    for train, test in rskf.split(X, y):
        # Fit a fresh, unfitted copy per fold so the template estimators in
        # `classifiers` are never mutated and no state carries between folds.
        model = clone(clf)
        model.fit(X[train], y[train])
        y_true = y[test]
        y_pred = model.predict(X[test])

        # Probabilities feed both ROC AUC and log loss. Estimators without
        # predict_proba (SVC with its default probability=False) still expose
        # decision_function, which is a valid ranking score for ROC AUC.
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X[test])
            ranking_scores = y_prob[:, 1]
        elif hasattr(model, "decision_function"):
            y_prob = None
            ranking_scores = model.decision_function(X[test])
        else:
            y_prob = None
            ranking_scores = None

        fold_scores = results[clf_name]
        fold_scores['balanced_accuracy'].append(balanced_accuracy_score(y_true, y_pred))
        # zero_division=0 yields the same value as the default but without the
        # "ill-defined" warning when a fold has no predicted positives.
        fold_scores['precision'].append(precision_score(y_true, y_pred, zero_division=0))
        fold_scores['recall'].append(recall_score(y_true, y_pred, zero_division=0))
        fold_scores['f1'].append(f1_score(y_true, y_pred, zero_division=0))
        fold_scores['roc_auc'].append(
            roc_auc_score(y_true, ranking_scores) if ranking_scores is not None else np.nan
        )
        # Log loss needs calibrated probabilities; record NaN when unavailable.
        fold_scores['log_loss'].append(
            log_loss(y_true, y_prob) if y_prob is not None else np.nan
        )


print("Mean Performance Metrics:")
for metric in METRIC_NAMES:
    print(f"{metric.capitalize()}:")
    for clf_name, scores in results.items():
        mean_score, std_score = _nan_mean_std(scores[metric])
        print(f"   {clf_name}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")
    print()


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

In [None]:
# Visualization of results: one box plot per metric, one box per classifier,
# showing the spread of that metric across all cross-validation folds.
fig, axs = plt.subplots(3, 2, figsize=(15, 15))
metrics = ['balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'log_loss']
titles = ['Balanced Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC', 'Log Loss']
classifier_names = list(classifiers)  # concrete list rather than a dict_keys view

for ax, metric, title in zip(axs.ravel(), metrics, titles):
    # Drop NaN folds (metrics a classifier cannot produce) before plotting:
    # boxplot does not ignore NaNs. A classifier with no valid scores is passed
    # as an empty series, so its slot on the x-axis stays empty.
    fold_scores = []
    for clf in classifier_names:
        values = np.asarray(results[clf][metric], dtype=float)
        fold_scores.append(values[~np.isnan(values)])

    # Tick labels are set once below, so the `labels=` keyword (deprecated in
    # recent matplotlib releases) is not needed here.
    ax.boxplot(fold_scores)
    ax.set_title(title)
    ax.set_xlabel('Classifiers')
    ax.set_ylabel(title)
    ax.set_xticklabels(classifier_names, rotation=45, ha='right')

plt.tight_layout()
plt.show()

In [None]:
# Summary of results: mean and standard deviation of every metric, grouped by
# classifier. The loop variable is deliberately not called `metrics`, so the
# `metrics` list used by the plotting cell is not overwritten when this runs.
for clf_name, clf_scores in results.items():
    print(f"{clf_name} Performance Metrics:")
    for metric, scores in clf_scores.items():
        # Ignore NaN folds (metrics the classifier cannot produce). When every
        # fold is NaN, report NaN directly instead of calling nanmean/nanstd,
        # which would emit a RuntimeWarning on an empty slice.
        finite_scores = np.asarray(scores, dtype=float)
        finite_scores = finite_scores[~np.isnan(finite_scores)]
        mean_score = finite_scores.mean() if finite_scores.size else np.nan
        std_score = finite_scores.std() if finite_scores.size else np.nan
        print(f"   {metric.capitalize()}: Mean = {mean_score:.4f}, Std = {std_score:.4f}")