In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
import os  
import json

In [3]:
df = pd.read_csv('btc.csv')
df.head(5) 

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2014-09-17 00:00:00+00:00,465.864014,468.174011,452.421997,457.334015,21056800,0.0,0.0
1,2014-09-18 00:00:00+00:00,456.859985,456.859985,413.104004,424.440002,34483200,0.0,0.0
2,2014-09-19 00:00:00+00:00,424.102997,427.834991,384.532013,394.79599,37919700,0.0,0.0
3,2014-09-20 00:00:00+00:00,394.673004,423.29599,389.882996,408.903992,36863600,0.0,0.0
4,2014-09-21 00:00:00+00:00,408.084991,412.425995,393.181,398.821014,26580100,0.0,0.0


In [5]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits'],
      dtype='object')

In [6]:
df.drop([ 'Open', 'High', 'Low',  'Dividends', 'Stock Splits', 'Volume'], inplace = True, axis = 1)

In [7]:
df['time'] = pd.to_datetime(df['Date'])
df['formatted_time'] = df['time'].dt.strftime('%d-%m-%Y')
df['formatted_time']

0       17-09-2014
1       18-09-2014
2       19-09-2014
3       20-09-2014
4       21-09-2014
           ...    
3543    30-05-2024
3544    31-05-2024
3545    01-06-2024
3546    02-06-2024
3547    03-06-2024
Name: formatted_time, Length: 3548, dtype: object

In [8]:
df.drop([ 'Date', 'time'], inplace = True, axis = 1)

In [9]:
df

Unnamed: 0,Close,formatted_time
0,457.334015,17-09-2014
1,424.440002,18-09-2014
2,394.795990,19-09-2014
3,408.903992,20-09-2014
4,398.821014,21-09-2014
...,...,...
3543,68364.992188,30-05-2024
3544,67491.414062,31-05-2024
3545,67706.937500,01-06-2024
3546,67751.601562,02-06-2024


In [18]:
import pandas_ta as ta
 
df['SMA'] = ta.sma(df['Close'], length=5) 
df['EMA'] = ta.ema(df['Close'], length=5) 
df['RSI'] = ta.rsi(df['Close'], length=14) 

In [20]:
df.fillna(0, inplace = True)

In [21]:
new_df = df.copy()

In [24]:
X = new_df.drop(['Close', 'formatted_time'], axis = 1).values
Y = new_df['Close'].values

In [25]:
X

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.78875312e+04, 6.79410445e+04, 5.29987433e+01],
       [6.77786078e+04, 6.78778969e+04, 5.31387865e+01],
       [6.80777047e+04, 6.82764573e+04, 5.72033397e+01]])

In [26]:
Y

array([  457.33401489,   424.44000244,   394.79598999, ...,
       67706.9375    , 67751.6015625 , 69073.578125  ])

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 


In [29]:
rf = RandomForestRegressor()
 
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
grid_search_rf.fit(X_train, y_train)


In [30]:
best_rf = grid_search_rf.best_estimator_
best_rf

In [31]:
y_pred_rf = best_rf.predict(X_test) 

array([  296.52546651, 46194.86747396, 24178.21059896, 48155.75710938,
       14957.02883464,  8100.49818685,  3691.2543571 ,  9149.37807292,
        9106.26328776, 24056.54048177,   712.74038778,   531.10718669,
       10098.1198763 , 26053.34567708,  8463.90218099, 67866.64440104,
         361.01270203, 20710.62946615, 16104.5318099 ,  6498.16445638,
        5763.00170573, 52198.85075521, 26316.3050651 ,  8687.18435547,
       48280.08502604, 19370.81420573,   237.43453115,   366.11615417,
        4331.74171875, 13751.1035612 , 30174.07259115,   225.59937042,
       66728.94328125, 19751.02786458, 19233.3300651 ,  4408.52001302,
        1010.38261759,   236.26087179,   310.53152425,   943.48327189,
       26928.12545573, 25691.04388021, 37008.86270833,   315.76996847,
       61532.67695313,  9460.42238932,   609.7626355 , 28394.81283854,
       39290.80018229, 10412.76126953,   241.2175708 , 50198.35674479,
         448.80611287,  6475.76320964, 42444.49557292,  9812.15233073,
      

In [32]:
rf_metrics = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    'MSE': mean_squared_error(y_test, y_pred_rf),
    'MAE': mean_absolute_error(y_test, y_pred_rf),
    'R2': r2_score(y_test, y_pred_rf)
}
print(rf_metrics)


{'RMSE': 550.7509947507542, 'MSE': 303326.6582189453, 'MAE': 264.3326364674436, 'R2': 0.9990843529617618}


In [34]:
for i in range(2):
    sma = ta.sma(data['Close'], length=5).iloc[-1]
    ema = ta.ema(data['Close'], length=5).iloc[-1]
    rsi = ta.rsi(data['Close'], length=14).iloc[-1]
    # sma_5, ema_5, rsi_14 = scaler.fit_transform(np.array([sma, ema, rsi]).reshape(-1, 1))
    # new_close = best_rf.predict([[0.163528, 0.0, 0.0, 0.0, 1.0, 0.0, sma_5[0], ema_5[0], rsi_14[0]]])
    new_close = best_rf.predict([[sma, ema, rsi]])
    print(new_close)
    new_row = pd.DataFrame({"Close": new_close, "Volume": 21292490752, "formatted_time" : "0", "SMA" : sma, "EMA" : ema, "RSI" : rsi})
    data = pd.concat([data, new_row], ignore_index= True)
    

[68950.04520833]
[68783.3953125]
[68533.40567708]
[68396.24924479]
[67618.65583333]
[66222.8146875]
[64616.13161458]
[64177.259375]
[62832.32619792]
[62783.76028646]


In [35]:
import pickle 
with open('btc.pkl', 'wb') as f:
    pickle.dump(best_rf, f)