# Libraries

In [2]:
import matplotlib
matplotlib.use('TkAgg')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from new_datasets_py import create_subsets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (balanced_accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, log_loss)
from imblearn.ensemble import BalancedRandomForestClassifier


In [3]:
import pandas as pd

# Load data
data = pd.read_csv('crypto-markets.csv')

# Filter data for cryptocurrencies with ranknow < 30
filtered_data = data[data['ranknow'] < 30]

# Convert the date column to datetime and set it as the index
filtered_data.loc[:, 'date'] = pd.to_datetime(filtered_data['date'])
filtered_data.set_index('date', inplace=True)

# Initialize a dictionary to hold the separate datasets
datasets_dict = {}

# Group the filtered data by the 'slug' column
grouped = filtered_data.groupby('slug')

# Iterate over each group
for crypto, group in grouped:
    datasets_with_labels = []
    close_values = group['close'].values

    for start in range(len(close_values) - 9):
        end = start + 10
        window = close_values[start:end]
        value_day_7 = window[6]
        value_day_10 = window[9]
        label = 1 if value_day_10 > value_day_7 else 0

        datasets_with_labels.append((window, label))
    
    # Convert the list of tuples into a DataFrame
    dataset = pd.DataFrame(datasets_with_labels, columns=['close_values', 'label'])
    
    # Store the DataFrame in the dictionary
    datasets_dict[crypto] = dataset

# Now datasets_dict contains 30 DataFrames, one for each cryptocurrency
# You can access each dataset like this: datasets_dict['bitcoin']

# Print to check one of the datasets
print(datasets_dict['aeternity'])

# Print missing values for each dataset
for crypto, df in datasets_dict.items():
    missing_values = df.isnull().sum()
    print(f'Missing values in {crypto} dataset:\n{missing_values}\n')


                                          close_values  label
0    [0.695589, 0.742796, 0.86392, 0.734774, 1.07, ...      0
1    [0.742796, 0.86392, 0.734774, 1.07, 1.43, 1.33...      1
2    [0.86392, 0.734774, 1.07, 1.43, 1.33, 1.4, 1.4...      1
3    [0.734774, 1.07, 1.43, 1.33, 1.4, 1.4, 1.31, 2...      1
4    [1.07, 1.43, 1.33, 1.4, 1.4, 1.31, 2.38, 3.18,...      1
..                                                 ...    ...
357  [3.28, 3.19, 3.19, 3.2, 2.98, 3.18, 3.12, 3.15...      1
358  [3.19, 3.19, 3.2, 2.98, 3.18, 3.12, 3.15, 3.13...      1
359  [3.19, 3.2, 2.98, 3.18, 3.12, 3.15, 3.13, 3.63...      1
360  [3.2, 2.98, 3.18, 3.12, 3.15, 3.13, 3.63, 3.53...      0
361  [2.98, 3.18, 3.12, 3.15, 3.13, 3.63, 3.53, 3.2...      1

[362 rows x 2 columns]
Missing values in aeternity dataset:
close_values    0
label           0
dtype: int64

Missing values in binance-coin dataset:
close_values    0
label           0
dtype: int64

Missing values in bitcoin dataset:
close_values    0
la

  return Index(sequences[0], name=names)


In [4]:
import matplotlib.pyplot as plt
import math

# Function to check balance and plot distribution for each dataset
def check_balance_and_plot(datasets_dict):
    num_datasets = len(datasets_dict)
    cols = 5  # Number of columns for subplots
    rows = math.ceil(num_datasets / cols)  # Number of rows for subplots

    fig, axes = plt.subplots(rows, cols, figsize=(20, rows * 4))
    axes = axes.flatten()  # Flatten the 2D array of axes for easy iteration

    for idx, (crypto, df) in enumerate(datasets_dict.items()):
        # Checking balance of the dataset
        class_counts = df['label'].value_counts()
        print(f'Class distribution for {crypto}:')
        print(class_counts)

        # Plotting the class distribution
        class_counts.plot(kind='bar', ax=axes[idx])
        axes[idx].set_title(f'Class Distribution for {crypto}')
        axes[idx].set_xlabel('Class')
        axes[idx].set_ylabel('Frequency')

        # Calculating imbalance ratio
        imbalance_ratio = class_counts.min() / class_counts.max()
        print(f"Imbalance Ratio for {crypto}: {imbalance_ratio}\n")

    # Remove any unused subplots
    for ax in axes[num_datasets:]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

# Call the function to check balance and plot for each dataset
check_balance_and_plot(datasets_dict)

Class distribution for aeternity:
label
0    186
1    176
Name: count, dtype: int64
Imbalance Ratio for aeternity: 0.946236559139785

Class distribution for binance-coin:
label
1    167
0    141
Name: count, dtype: int64
Imbalance Ratio for binance-coin: 0.844311377245509

Class distribution for bitcoin:
label
1    1039
0     818
Name: count, dtype: int64
Imbalance Ratio for bitcoin: 0.7872954764196343

Class distribution for bitcoin-cash:
label
0    163
1    147
Name: count, dtype: int64
Imbalance Ratio for bitcoin-cash: 0.901840490797546

Class distribution for bitcoin-gold:
label
0    126
1     92
Name: count, dtype: int64
Imbalance Ratio for bitcoin-gold: 0.7301587301587301

Class distribution for bytecoin-bcn:
label
0    827
1    614
Name: count, dtype: int64
Imbalance Ratio for bytecoin-bcn: 0.7424425634824667

Class distribution for cardano:
label
1    121
0    119
Name: count, dtype: int64
Imbalance Ratio for cardano: 0.9834710743801653

Class distribution for dash:
label
0    

In [5]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss

# Assuming datasets_dict is already created
results = []

classifiers = {
    "Multi-Layer Perceptron": MLPClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Balanced Random Forest": BalancedRandomForestClassifier(replacement=True, sampling_strategy='all', random_state=42, bootstrap=False),
    "Logistic Regression": LogisticRegression(class_weight='balanced', max_iter=1000),
    "Support Vector Machine": SVC(probability=True),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)

for crypto, df in datasets_dict.items():
    X = np.array(df['close_values'].apply(lambda x: x[:7]).tolist())
    y = df['label'].astype(int).values

    for clf_name, clf in classifiers.items():
        metrics = {
            'balanced_accuracy': [],
            'precision': [],
            'recall': [],
            'f1': [],
            'roc_auc': [],
            'log_loss': []
        }
        for train_index, test_index in rskf.split(X, y):
            model = clf
            model.fit(X[train_index], y[train_index])
            y_pred = model.predict(X[test_index])
            y_prob = model.predict_proba(X[test_index]) if hasattr(model, "predict_proba") else None

            metrics['balanced_accuracy'].append(balanced_accuracy_score(y[test_index], y_pred))
            metrics['precision'].append(precision_score(y[test_index], y_pred))
            metrics['recall'].append(recall_score(y[test_index], y_pred))
            metrics['f1'].append(f1_score(y[test_index], y_pred))
            if y_prob is not None:
                metrics['roc_auc'].append(roc_auc_score(y[test_index], y_prob[:, 1]))
                metrics['log_loss'].append(log_loss(y[test_index], y_prob))
            else:
                metrics['roc_auc'].append(np.nan)
                metrics['log_loss'].append(np.nan)

        mean_metrics = {metric: np.nanmean(values) for metric, values in metrics.items()}
        std_balanced_accuracy = np.nanstd(metrics['balanced_accuracy'])

        results.append({
            'crypto': crypto,
            'classifier': clf_name,
            **mean_metrics,
            'balanced_accuracy_std': std_balanced_accuracy
        })

# Konwersja wyników do DataFrame
results_df = pd.DataFrame(results)

# Pivot the DataFrame to get the desired format
pivot_df = results_df.pivot_table(index='crypto', columns='classifier', values=[
    'balanced_accuracy', 'balanced_accuracy_std',
    'precision', 'recall', 'f1', 'roc_auc', 'log_loss'
])

# Flatten the multi-index columns
pivot_df.columns = ['_'.join(col).strip() for col in pivot_df.columns.values]

# Save the pivoted DataFrame to a CSV file
pivot_df.to_csv('pivoted_classification_results.csv', index=True)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Wczytanie danych z pliku CSV
results_df = pd.read_csv('classification_results.csv')

# Filtrowanie danych dla Bitcoina
bitcoin_results = results_df[results_df['crypto'] == 'bitcoin']
bitcoin_results = bitcoin_results[bitcoin_results['classifier'] == 'Decision Tree']

# Wybór metryk do wyświetlenia
metrics = ['balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Przygotowanie danych do wykresu
classifier_names = bitcoin_results['classifier']
metric_values = bitcoin_results[metrics]

# Utworzenie wykresu słupkowego
fig, ax = plt.subplots(figsize=(12, 8))

# Tworzenie wykresu dla każdej metryki
width = 0.15  # Szerokość słupków
x = np.arange(len(classifier_names))

for i, metric in enumerate(metrics):
    ax.bar(x + i * width, bitcoin_results[metric], width, label=metric)

# Dodanie etykiet i tytułu
ax.set_xlabel('Klasyfikator')
ax.set_ylabel('Wartość Metryki')
ax.set_title('Wyniki klasyfikacji dla Bitcoina')
ax.set_xticks(x + width * (len(metrics) - 1) / 2)
ax.set_xticklabels(classifier_names)
ax.legend()

# Obrócenie etykiet na osi X dla lepszej czytelności
plt.xticks(rotation=45)
plt.tight_layout()

# Wyświetlenie wykresu
plt.show()


In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Wczytanie danych z pliku CSV
results_df = pd.read_csv('classification_results.csv')

# Filtrowanie danych dla Bitcoina
bitcoin_results = results_df[results_df['crypto'] == 'bitcoin']
bitcoin_results = bitcoin_results[bitcoin_results['classifier'] == 'Decision Tree']

# Wybór metryk do wyświetlenia
metrics = ['log_loss']

# Przygotowanie danych do wykresu
classifier_names = bitcoin_results['classifier']
metric_values = bitcoin_results[metrics]

# Utworzenie wykresu słupkowego
fig, ax = plt.subplots(figsize=(12, 8))

# Tworzenie wykresu dla każdej metryki
width = 0.15  # Szerokość słupków
x = np.arange(len(classifier_names))

for i, metric in enumerate(metrics):
    ax.bar(x + i * width, bitcoin_results[metric], width, label=metric)

# Dodanie etykiet i tytułu
ax.set_xlabel('Klasyfikator')
ax.set_ylabel('Wartość Metryki')
ax.set_title('Wyniki klasyfikacji dla Bitcoina')
ax.set_xticks(x + width * (len(metrics) - 1) / 2)
ax.set_xticklabels(classifier_names)
ax.legend()

# Obrócenie etykiet na osi X dla lepszej czytelności
plt.xticks(rotation=45)
plt.tight_layout()

# Wyświetlenie wykresu
plt.show()


invalid command name "6249734976process_stream_events"
    while executing
"6249734976process_stream_events"
    ("after" script)


In [8]:
# Load the results from the CSV file
loaded_results_df = pd.read_csv('classification_results.csv')

# Print the loaded results
print(loaded_results_df)

# Optionally, print the mean performance metrics again
mean_loaded_results = loaded_results_df.groupby(['crypto', 'classifier'])[metrics].mean().reset_index()
print("Mean Performance Metrics (Loaded from CSV):")
for metric in metrics:
    print(f"{metric.capitalize()}:")
    for _, row in mean_loaded_results.iterrows():
        print(f"   {row['crypto']} - {row['classifier']}: Mean = {row[metric]:.4f}")
    print()


         crypto              classifier  balanced_accuracy  precision  \
0     aeternity  Multi-Layer Perceptron           0.537970   0.511628   
1     aeternity  Multi-Layer Perceptron           0.589715   0.575000   
2     aeternity  Multi-Layer Perceptron           0.542471   0.526316   
3     aeternity  Multi-Layer Perceptron           0.558301   0.534884   
4     aeternity  Multi-Layer Perceptron           0.583012   0.571429   
...         ...                     ...                ...        ...   
2025    zilliqa     K-Nearest Neighbors           0.717949   0.727273   
2026    zilliqa     K-Nearest Neighbors           0.599359   0.615385   
2027    zilliqa     K-Nearest Neighbors           0.644231   0.700000   
2028    zilliqa     K-Nearest Neighbors           0.647436   0.750000   
2029    zilliqa     K-Nearest Neighbors           0.541667   0.533333   

        recall        f1   roc_auc  log_loss  
0     0.628571  0.564103  0.524060  0.693111  
1     0.638889  0.605263  0.5

In [9]:
results_df = pd.read_csv('classification_results.csv')

# Filtrowanie danych dla Bitcoina
bitcoin_results = results_df[results_df['crypto'] == 'bitcoin']

# Wybór metryk do wyświetlenia
metrics = ['balanced_accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'log_loss']

# Przygotowanie danych do wykresu
classifier_names = bitcoin_results['classifier']
metric_values = bitcoin_results[metrics]

# Utworzenie wykresu słupkowego
fig, ax = plt.subplots(figsize=(12, 8))

# Tworzenie wykresu dla każdej metryki
width = 0.15  # Szerokość słupków
x = np.arange(len(classifier_names))

for i, metric in enumerate(metrics):
    ax.bar(x + i * width, bitcoin_results[metric], width, label=metric)

# Dodanie etykiet i tytułu
ax.set_xlabel('Klasyfikator')
ax.set_ylabel('Wartość Metryki')
ax.set_title('Wyniki klasyfikacji dla Bitcoina')
ax.set_xticks(x + width * (len(metrics) - 1) / 2)
ax.set_xticklabels(classifier_names)
ax.legend()

# Obrócenie etykiet na osi X dla lepszej czytelności
plt.xticks(rotation=45)
plt.tight_layout()

# Wyświetlenie wykresu
plt.show()

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns

# Wczytaj dane z pliku CSV
# file_path = 'path_to_your_file.csv'
data = pd.read_csv('pivoted_classification_results.csv')

# Przefiltruj kolumny tylko dla `balanced_accuracy`
balanced_acc_cols = [col for col in data.columns if 'balanced_accuracy' in col]
data_bal_acc = data[['crypto'] + balanced_acc_cols]

# Skrócone nazwy kolumn
short_col_names = {
    'balanced_accuracy_Balanced Random Forest': 'BRF',
    'balanced_accuracy_Decision Tree': 'DT',
    'balanced_accuracy_K-Nearest Neighbors': 'KNN',
    'balanced_accuracy_Logistic Regression': 'LR',
    'balanced_accuracy_Multi-Layer Perceptron': 'MLP',
    'balanced_accuracy_Random Forest': 'RF',
    'balanced_accuracy_Support Vector Machine': 'SVM'
}
print(data_bal_acc)


              crypto  balanced_accuracy_Balanced Random Forest  \
0          aeternity                                  0.544775   
1       binance-coin                                  0.671891   
2            bitcoin                                  0.628519   
3       bitcoin-cash                                  0.635970   
4       bitcoin-gold                                  0.643893   
5       bytecoin-bcn                                  0.570540   
6            cardano                                  0.649877   
7               dash                                  0.596228   
8             decred                                  0.568222   
9                eos                                  0.634528   
10          ethereum                                  0.625721   
11  ethereum-classic                                  0.614757   
12              icon                                  0.563745   
13              iota                                  0.616622   
14        

In [18]:
# Przefiltruj kolumny tylko dla `balanced_accuracy`
from scipy.stats import t
balanced_acc_cols = [col for col in data.columns if 'balanced_accuracy' in col]
data_bal_acc = data[['crypto'] + balanced_acc_cols]

# Skrócone nazwy kolumn
short_col_names = {
    'balanced_accuracy_Balanced Random Forest': 'BRF',
    'balanced_accuracy_Decision Tree': 'DT',
    'balanced_accuracy_K-Nearest Neighbors': 'KNN',
    'balanced_accuracy_Logistic Regression': 'LR',
    'balanced_accuracy_Multi-Layer Perceptron': 'MLP',
    'balanced_accuracy_Random Forest': 'RF',
    'balanced_accuracy_Support Vector Machine': 'SVM',
    'balanced_accuracy_std_Balanced Random Forest': 'BRF_std',
    'balanced_accuracy_std_Decision Tree': 'DT_std',
    'balanced_accuracy_std_K-Nearest Neighbors': 'KNN_std',
    'balanced_accuracy_std_Logistic Regression': 'LR_std',
    'balanced_accuracy_std_Multi-Layer Perceptron': 'MLP_std',
    'balanced_accuracy_std_Random Forest': 'RF_std',
    'balanced_accuracy_std_Support Vector Machine': 'SVM_std'
}

# Zamień nazwy kolumn na skrócone
data_bal_acc = data_bal_acc.rename(columns=short_col_names)

# Wybierz dane dla pierwszej kryptowaluty (aeternity)
crypto_data = data_bal_acc #[data_bal_acc['crypto'] == 'aeternity']

# Lista klasyfikatorów
classifiers = ['BRF', 'DT', 'KNN', 'LR', 'MLP', 'RF', 'SVM']

# Oblicz test t-studenta dla każdej pary klasyfikatorów
results = []

cryptocurrencies = data_bal_acc['crypto'].unique()

for crypto in cryptocurrencies: 
    for i in range(len(classifiers)):
        for j in range(len(classifiers)):
            clf1 = classifiers[i]
            clf2 = classifiers[j]
            
            mean1 = crypto_data[f'{clf1}'].values[0]
            mean2 = crypto_data[f'{clf2}'].values[0]
            
            std1 = crypto_data[f'{clf1}_std'].values[0]
            std2 = crypto_data[f'{clf2}_std'].values[0]
            
            n1 = n2 = 361  # Zakładam, że mamy 30 próbek dla każdej metryki

            # Oblicz statystykę t
            t_stat = (mean1 - mean2) / np.sqrt((std1**2 / n1) + (std2**2 / n2))
            
            # Oblicz stopnie swobody
            df = ((std1**2 / n1) - (std2**2 / n2))**2 / (((std1**2 / n1)**2 / (n1 - 1)) + ((std2**2 / n2)**2 / (n2 - 1)))
            
            # Oblicz p-value
            p_value = 2 * (1 - t.cdf(np.abs(t_stat), df))
            
            # Dodaj wynik do listy tylko jeśli p-value > 0.05
            results.append({
                'crypto': crypto,
                'Classifier 1': clf1,
                'Classifier 2': clf2,
                't-statistic': t_stat,
                'p-value': p_value
            })
            
                
            # n1 = n2 = 30  # Zakładam, że mamy 30 próbek dla każdej metryki

            # # Oblicz statystykę t
            # t_stat = (mean1 - mean2) / np.sqrt((std1**2 / n1) + (std2**2 / n2))
            
            # # Dodaj wynik do listy
            # if t_stat > 2:
            #     results.append({
            #         'crypto': crypto,
            #         'Classifier 1': clf1,
            #         'Classifier 2': clf2,
            #         't-statistic': t_stat
            #     })


# Konwertuj wyniki do DataFrame i wyświetl
results_df = pd.DataFrame(results)
print(results_df)

         crypto Classifier 1 Classifier 2  t-statistic       p-value
0     aeternity          BRF          BRF     0.000000           NaN
1     aeternity          BRF           DT     5.132105  4.651283e-06
2     aeternity          BRF          KNN   -14.826851  2.797731e-01
3     aeternity          BRF           LR    -5.732062  6.933376e-08
4     aeternity          BRF          MLP    -4.087927  3.044621e-04
...         ...          ...          ...          ...           ...
1416    zilliqa          SVM          KNN   -11.739675  1.626577e-02
1417    zilliqa          SVM           LR    -1.654362  1.011430e-01
1418    zilliqa          SVM          MLP    -0.321350  7.526688e-01
1419    zilliqa          SVM           RF     3.117379  2.464814e-03
1420    zilliqa          SVM          SVM     0.000000           NaN

[1421 rows x 5 columns]


In [19]:
results_df.to_csv('t_test_results.csv', index=False)

In [13]:
# import pandas as pd

# # Load your CSV file
# file_path = 'path/to/your/t_test_results.csv'
# data = pd.read_csv(file_path)

# # Extract unique classifiers
# classifiers_1 = data['classifier1'].unique()
# classifiers_2 = data['classifier2'].unique()

# # Split data into three groups
# cryptos = data['cryptocurrency'].unique()
# cryptos_split = [cryptos[i:i + 9] for i in range(0, len(cryptos), 9)]

# # Function to generate LaTeX table for a group of cryptocurrencies
# def generate_latex_table(group, classifiers_1, classifiers_2):
#     latex = "\\begin{table}[H]\n\\centering\n\\begin{tabular}{|" + "c|" * (len(classifiers_2) + 1) + "}\n"
#     latex += "\\hline\n"
#     latex += " & " + " & ".join(classifiers_2) + " \\\\\n\\hline\n"
#     for crypto in group:
#         for c1 in classifiers_1:
#             row = [crypto + " " + c1] + ['' for _ in classifiers_2]
#             for c2 in classifiers_2:
#                 p_value = data[(data['cryptocurrency'] == crypto) & 
#                                (data['classifier1'] == c1) & 
#                                (data['classifier2'] == c2)]['p_value'].values[0]
#                 color = "white" if p_value < 0.05 else "black"
#                 cell_color = "\\cellcolor{" + color + "}"
#                 row[classifiers_2.tolist().index(c2) + 1] = cell_color
#             latex += " & ".join(row) + " \\\\\n\\hline\n"
#     latex += "\\end{tabular}\n\\end{table}\n"
#     return latex

# # Generate LaTeX code for each table
# latex_code = ""
# for i, group in enumerate(cryptos_split):
#     latex_code += generate_latex_table(group, classifiers_1, classifiers_2) + "\n"

# # Print the LaTeX code
# print(latex_code)


FileNotFoundError: [Errno 2] No such file or directory: 'path/to/your/t_test_results.csv'