In [None]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='decision_tree_data.csv')

In [None]:
# Preview the first five rows (5 is the default for DataFrame.head).
df.head(n=5)

Unnamed: 0,timestamp,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume
0,1609459200000,28923.63,29031.34,28690.17,28995.13,2311.811445,1609462799999,66768830.0,58389,1215.359238,35103540.0
1,1609462800000,28995.13,29470.0,28960.35,29409.99,5403.068471,1609466399999,158357800.0,103896,3160.041701,92613990.0
2,1609466400000,29410.0,29465.26,29120.03,29194.65,2384.23156,1609469999999,69842650.0,57646,1203.433506,35252750.0
3,1609470000000,29195.25,29367.0,29150.02,29278.4,1461.345077,1609473599999,42760780.0,42510,775.915666,22705550.0
4,1609473600000,29278.41,29395.0,29029.4,29220.31,2038.046803,1609477199999,59614640.0,55414,1003.342834,29346380.0


In [None]:
# Remove, in place, every row that contains at least one missing value.
# axis=0 / how='any' are the pandas defaults, spelled out here for the reader.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Linear regression
#
# Target is the 'close' column; every other column of `df` is used as a feature.
y = df['close']
X = df.drop(columns=['close'])

# Hold out 20% of the rows for testing (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max scaling: the scaler is fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit ordinary least squares and predict on the held-out rows.
lr = LinearRegression().fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

# Report the three regression metrics on the test split.
for label, metric in (
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
    ('R2 score:', r2_score),
):
    print(label, metric(y_test, y_pred))

MAE: 85.5569782929471
MSE: 19846.365499768923
R2 score: 0.9998948408137114


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# Data preparation for the PyTorch model.
#
# Features: every column except the target ('close') and the two time columns.
raw_inputs = df.drop(['close', 'timestamp', 'close_time'], axis=1).values
targets = df['close'].values

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# mean/std of the test rows leak into the training features.
# Same test_size / random_state as before, so the same rows land in each split.
X_train, X_test, y_train, y_test = train_test_split(
    raw_inputs, targets, test_size=0.2, random_state=42
)

# Standardise using statistics computed from the training rows only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so the name `inputs` still refers to the scaled full feature matrix
# (now scaled with the train-only statistics).
inputs = scaler.transform(raw_inputs)

# Convert to float32 tensors; targets become column vectors of shape (n, 1)
# so they line up with the model output in the loss.
X_train, X_test = torch.from_numpy(X_train).float(), torch.from_numpy(X_test).float()
y_train, y_test = torch.from_numpy(y_train).float().view(-1, 1), torch.from_numpy(y_test).float().view(-1, 1)

class LinearRegression(nn.Module):
    """Affine map ``x @ weights + bias`` with learnable parameters.

    NOTE: this class has the same name as ``sklearn.linear_model.LinearRegression``
    imported at the top of the notebook; once this cell has run, the bare name
    ``LinearRegression`` refers to this class.

    Args:
        input_size: number of input features (rows of the weight matrix).
        output_size: number of outputs (columns of the weight matrix).
    """

    def __init__(self, input_size, output_size):
        super(LinearRegression, self).__init__()
        # Both parameters start from standard-normal samples; the weight
        # matrix is drawn first, then the bias vector.
        initial_weights = torch.randn(input_size, output_size)
        initial_bias = torch.randn(output_size)
        self.weights = nn.Parameter(initial_weights)
        self.bias = nn.Parameter(initial_bias)

    def forward(self, x):
        """Return ``x @ weights + bias`` for a batch ``x``."""
        projected = torch.matmul(x, self.weights)
        return projected + self.bias

# Model dimensions: one input per feature column, a single regression output.
input_size = X_train.shape[1]
output_size = 1

# The model initialises its parameters with torch.randn, so seed the global
# torch RNG first to make the run reproducible.
torch.manual_seed(42)
model = LinearRegression(input_size, output_size)

# Mean-squared-error objective optimised with plain SGD.
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Full-batch gradient descent: every epoch uses the entire training set.
num_epochs = 50000
for epoch in range(num_epochs):
    outputs = model(X_train)
    loss = criterion(outputs, y_train)

    # Clear gradients from the previous step, backpropagate, update parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Log the training loss every 5000 epochs.
    if (epoch+1) % 5000 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Final evaluation: no gradients are needed, so skip building autograd graphs.
with torch.no_grad():
    train_mse = criterion(model(X_train), y_train).item()
    test_mse = criterion(model(X_test), y_test).item()

print(f'MSE модели на обучающей выборке {train_mse}')
print(f'MSE модели на тестовой выборке {test_mse}')

Epoch [5000/50000], Loss: 30515.9141
Epoch [10000/50000], Loss: 28774.6484
Epoch [15000/50000], Loss: 27311.9902
Epoch [20000/50000], Loss: 26079.7441
Epoch [25000/50000], Loss: 25038.8789
Epoch [30000/50000], Loss: 24157.6562
Epoch [35000/50000], Loss: 23409.9980
Epoch [40000/50000], Loss: 22774.5469
Epoch [45000/50000], Loss: 22233.5039
Epoch [50000/50000], Loss: 21772.2598
MSE модели на обучающей выборке 21772.169921875
MSE модели на тестовой выборке 22608.69921875


In [None]:
# Support-vector regression with a grid search over kernel and C.
#
# Target is 'close'; the two time columns are excluded from the features.
y = df['close']
X = df.drop(columns=['close', 'timestamp', 'close_time'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Min-max scaling fitted on the training rows and reused for the test rows.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate hyper-parameters: 3 kernels x 6 values of C.
svr_param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [1, 10, 50, 100, 250, 500],
}

# Cross-validated search scored by R^2; verbose=3 prints one line per fit.
svr = GridSearchCV(
    SVR(),
    param_grid=svr_param_grid,
    scoring=make_scorer(r2_score),
    verbose=3,
)
svr.fit(X_train_scaled, y_train)

# Keep the refitted best model for evaluation in the following cells.
best_svr = svr.best_estimator_
print(f'Best Score is {svr.best_score_}. Best Params:\n{svr.best_params_}\n')

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END ................C=1, kernel=linear;, score=0.304 total time=  11.2s
[CV 2/5] END ................C=1, kernel=linear;, score=0.304 total time=   9.7s
[CV 3/5] END ................C=1, kernel=linear;, score=0.306 total time=  10.9s
[CV 4/5] END ................C=1, kernel=linear;, score=0.305 total time=  11.0s
[CV 5/5] END ................C=1, kernel=linear;, score=0.304 total time=  11.3s
[CV 1/5] END ...................C=1, kernel=rbf;, score=0.394 total time=  12.5s
[CV 2/5] END ...................C=1, kernel=rbf;, score=0.393 total time=  12.0s
[CV 3/5] END ...................C=1, kernel=rbf;, score=0.398 total time=  12.1s
[CV 4/5] END ...................C=1, kernel=rbf;, score=0.399 total time=  13.1s
[CV 5/5] END ...................C=1, kernel=rbf;, score=0.398 total time=  12.2s
[CV 1/5] END ..................C=1, kernel=poly;, score=0.787 total time=   9.0s
[CV 2/5] END ..................C=1, kernel=poly;

In [None]:
# Evaluate the best SVR from the grid search on the held-out rows.
y_pred = best_svr.predict(X_test_scaled)
for label, metric in (
    ('MAE:', mean_absolute_error),
    ('MSE:', mean_squared_error),
    ('R2 score:', r2_score),
):
    print(label, metric(y_test, y_pred))

MAE: 111.70080843085596
MSE: 32037.736683834846
R2 score: 0.9998302428562932


In [None]:
# Gradient boosting regression with default hyper-parameters.
#
# Target is 'close'; every other column of `df` is used as a feature.
X = df.drop(['close'], axis=1)
y = df['close']

scaler = MinMaxScaler()

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler is fitted on the training rows only, then applied to the test rows.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fix the estimator's seed so the fitted model and the metrics printed below
# are reproducible across runs.
gbr = GradientBoostingRegressor(random_state=42)

gbr.fit(X_train_scaled, y_train)

y_pred = gbr.predict(X_test_scaled)

# Test-set metrics.
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('R2 score:', r2_score(y_test, y_pred))

MAE: 140.8769867400665
MSE: 43053.64062779806
R2 score: 0.9997718733026842


In [None]:
# Random forest regression tuned with a cross-validated grid search.
#
# Target is 'close'; every other column of `df` is used as a feature.
X = df.drop(['close'], axis=1)
y = df['close']

scaler = MinMaxScaler()

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaler is fitted on the training rows only, then applied to the test rows.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The forest is built from bootstrap samples, so fix its seed: otherwise the CV
# scores, and potentially the selected hyper-parameters, change from run to run.
rfr = RandomForestRegressor(random_state=42)

# 3 x 4 x 3 x 3 x 1 = 108 candidates, scored by R^2.
# verbose=3 prints one line per fit.
rfr = GridSearchCV(rfr,
                   param_grid={
                        'n_estimators': [50, 150, 300],
                        'max_depth': [3, 5, 7, 10],
                        'min_samples_split': [5, 10, 15],
                        'min_samples_leaf': [5, 10, 15],
                        'bootstrap': [True]
                        },
                   scoring=make_scorer(r2_score),
                   verbose=3)
rfr.fit(X_train_scaled, y_train)
print(f'Best score is {rfr.best_score_}. Best Params:\n{rfr.best_params_}\n')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=50;, score=0.986 total time=   2.1s
[CV 2/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=50;, score=0.986 total time=   1.7s
[CV 3/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=50;, score=0.985 total time=   1.7s
[CV 4/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=50;, score=0.987 total time=   1.7s
[CV 5/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=50;, score=0.987 total time=   1.7s
[CV 1/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=150;, score=0.986 total time=   5.8s
[CV 2/5] END bootstrap=True, max_depth=3, min_samples_leaf=5, min_samples_split=5, n_estimators=150;, score=0.986 total time=   4.9s
[CV 3/5] EN

In [None]:
# Take the refitted best forest from the grid search and score it on the
# held-out rows.
best_rfr = rfr.best_estimator_
y_pred = best_rfr.predict(X_test_scaled)
for label, metric in (
    ('MSE: ', mean_squared_error),
    ('MAE: ', mean_absolute_error),
    ('r2_score: ', r2_score),
):
    print(label, metric(y_test, y_pred))

MSE:  27888.174024937285
MAE:  100.25928402216894
r2_score:  0.9998522299870184


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
# Stacked ensemble: the linear model, the tuned random forest and the tuned SVR
# act as base learners; the gradient-boosting model combines their predictions.
final_estimator = gbr
estimators = list(zip(
    ('lr', 'rfr', 'svr'),
    (lr, best_rfr, best_svr),
))
stack_reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator)

In [None]:
# Fit the stacked ensemble on whichever X_train_scaled / y_train are currently
# bound in the kernel (the most recently executed split-and-scale cell).
stack_reg.fit(X=X_train_scaled, y=y_train)

In [None]:
# Score the stacked ensemble on the held-out rows.
y_pred = stack_reg.predict(X_test_scaled)
for label, metric in (
    ('MSE: ', mean_squared_error),
    ('MAE: ', mean_absolute_error),
    ('r2_score: ', r2_score),
):
    print(label, metric(y_test, y_pred))