In [1]:
import matplotlib
matplotlib.use('TkAgg')
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from new_datasets_py import create_subsets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [2]:
data = pd.read_csv('crypto-markets.csv')
filtered_data = data[data['ranknow'] < 30]
filtered_data.loc[:, 'date'] = pd.to_datetime(filtered_data['date'])
filtered_data.set_index('date', inplace=True)
datasets_with_labels = []

# Group the filtered data by 'slug'
grouped = filtered_data.groupby('slug')

# Iterate over each group
for crypto, group in grouped:
    close_values = group['close'].values

    # Iterate over the windows of 10 days
    for start in range(len(close_values) - 9):
        end = start + 10
        window = close_values[start:end]

        # Calculate label based on values at day 7 and day 10
        value_day_7 = window[6]  # Index 6 corresponds to the 7th day (0-indexed)
        value_day_10 = window[9]  # Index 9 corresponds to the 10th day (0-indexed)
        label = 1 if value_day_10 > value_day_7 else 0

        # Append the dataset and label as a tuple to the datasets_with_labels list
        datasets_with_labels.append((window, label))

# Convert the list of tuples to a DataFrame
combined_table = pd.DataFrame(datasets_with_labels, columns=['close_values', 'label'])

# Print to check
print(combined_table)
missing_values = combined_table.isnull().sum()
print(missing_values)

                                            close_values  label
0      [0.695589, 0.742796, 0.86392, 0.734774, 1.07, ...      0
1      [0.742796, 0.86392, 0.734774, 1.07, 1.43, 1.33...      1
2      [0.86392, 0.734774, 1.07, 1.43, 1.33, 1.4, 1.4...      1
3      [0.734774, 1.07, 1.43, 1.33, 1.4, 1.4, 1.31, 2...      1
4      [1.07, 1.43, 1.33, 1.4, 1.4, 1.31, 2.38, 3.18,...      1
...                                                  ...    ...
21966  [0.125544, 0.122172, 0.119888, 0.117055, 0.100...      1
21967  [0.122172, 0.119888, 0.117055, 0.100511, 0.112...      1
21968  [0.119888, 0.117055, 0.100511, 0.112595, 0.111...      1
21969  [0.117055, 0.100511, 0.112595, 0.11143, 0.1131...      1
21970  [0.100511, 0.112595, 0.11143, 0.113184, 0.1123...      1

[21971 rows x 2 columns]
close_values    0
label           0
dtype: int64


  return Index(sequences[0], name=names)


In [6]:
X = []
y = []
X = combined_table['close_values'].apply(lambda x: x[:7]).tolist()
y = combined_table['label'].astype(int).tolist()
X = np.array(X)
y = np.array(y)


(21971, 7)
(21971,)


In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import balanced_accuracy_score
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2)

for train, test in rskf.split(X, y):
    model = MLPClassifier()
    model.fit(X[train], y[train])
    y_pred = model.predict(X[test])
    acc = balanced_accuracy_score(y_pred,y[test])

    print(acc)

0.5345528143286075
0.48223092998955064
0.5233953956810729
0.5072359274400438
0.5443611421580474
0.5373109431507699
0.534475867404101
0.533239412844553
0.5152752997151532
0.5267979037301272


In [9]:
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

for clf_name, clf in classifiers.items():
    print(f"Evaluating {clf_name}:")
    for train, test in rskf.split(X, y):
        model = clf
        model.fit(X[train], y[train])
        y_pred = model.predict(X[test])
        acc = balanced_accuracy_score(y_pred, y[test])

        print(f"   Balanced Accuracy: {acc}")


Evaluating Decision Tree:
   Balanced Accuracy: 0.5565747266967559
   Balanced Accuracy: 0.5439225620106141
   Balanced Accuracy: 0.5383348483759889
   Balanced Accuracy: 0.5540798639976059
   Balanced Accuracy: 0.5478193311261443
   Balanced Accuracy: 0.5428122510788407
   Balanced Accuracy: 0.5311982360650784
   Balanced Accuracy: 0.5497260218328468
   Balanced Accuracy: 0.5367542062457317
   Balanced Accuracy: 0.5592341271826687
Evaluating Random Forest:
   Balanced Accuracy: 0.5746243377583318
   Balanced Accuracy: 0.5658946845222927
   Balanced Accuracy: 0.5612512309880731
