In [1]:
import pandas as pd 
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('sol.csv')
df.head(5) 

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits
0,2020-04-10 00:00:00+00:00,0.832005,1.313487,0.694187,0.951054,87364276,0.0,0.0
1,2020-04-11 00:00:00+00:00,0.951054,1.049073,0.76502,0.776819,43862444,0.0,0.0
2,2020-04-12 00:00:00+00:00,0.785448,0.95667,0.762426,0.882507,38736897,0.0,0.0
3,2020-04-13 00:00:00+00:00,0.89076,0.891603,0.773976,0.777832,18211285,0.0,0.0
4,2020-04-14 00:00:00+00:00,0.777832,0.796472,0.628169,0.661925,16747614,0.0,0.0


In [3]:
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits'],
      dtype='object')

In [4]:
df.drop([ 'Open', 'High', 'Low',  'Dividends', 'Stock Splits', 'Volume'], inplace = True, axis = 1)

In [5]:
df['time'] = pd.to_datetime(df['Date'])
df['formatted_time'] = df['time'].dt.strftime('%d-%m-%Y')
df['formatted_time']

0       10-04-2020
1       11-04-2020
2       12-04-2020
3       13-04-2020
4       14-04-2020
           ...    
1511    30-05-2024
1512    31-05-2024
1513    01-06-2024
1514    02-06-2024
1515    03-06-2024
Name: formatted_time, Length: 1516, dtype: object

In [6]:
df.drop([ 'Date', 'time'], inplace = True, axis = 1)

In [7]:
df

Unnamed: 0,Close,formatted_time
0,0.951054,10-04-2020
1,0.776819,11-04-2020
2,0.882507,12-04-2020
3,0.777832,13-04-2020
4,0.661925,14-04-2020
...,...,...
1511,166.976532,30-05-2024
1512,165.637711,31-05-2024
1513,166.030670,01-06-2024
1514,163.249908,02-06-2024


In [8]:
# df['Symbol'].unique()

In [9]:
# df[df['Symbol'] == 'BTC']['Close'].plot()

In [10]:
df.describe()

Unnamed: 0,Close
count,1516.0
mean,54.675174
std,59.301607
min,0.515273
25%,14.102368
50%,30.028152
75%,91.004284
max,258.934326


In [11]:
# for i in df['Symbol'].unique():
#     print(len(df[df['Symbol'] == i]))

In [12]:
# state = pd.get_dummies(df['Symbol'])
# state

In [13]:
# df = df.drop('Symbol', axis = 1) 
# df = pd.concat([df, state], axis = 1)
# df.head()

In [14]:
# df = df.replace({True : 1, False : 0})

In [15]:
import pandas_ta as ta
 
df['SMA'] = ta.sma(df['Close'], length=5) 
df['EMA'] = ta.ema(df['Close'], length=5) 
df['RSI'] = ta.rsi(df['Close'], length=14) 

In [16]:
df

Unnamed: 0,Close,formatted_time,SMA,EMA,RSI
0,0.951054,10-04-2020,,,
1,0.776819,11-04-2020,,,
2,0.882507,12-04-2020,,,
3,0.777832,13-04-2020,,,
4,0.661925,14-04-2020,0.810027,0.810027,
...,...,...,...,...,...
1511,166.976532,30-05-2024,167.409421,168.110910,53.379252
1512,165.637711,31-05-2024,167.866782,167.286510,52.208216
1513,166.030670,01-06-2024,167.073688,166.867897,52.537340
1514,163.249908,02-06-2024,165.993500,165.661901,49.917586


In [17]:
df.fillna(0, inplace = True)

In [18]:
new_df = df.copy()

In [19]:
X = new_df.drop(['Close', 'formatted_time'], axis = 1).values
Y = new_df['Close'].values

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 


In [23]:
rf = RandomForestRegressor()
 
param_grid = {
    'n_estimators': [50, 100, 150, 200, 250, 300]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3)
grid_search_rf.fit(X_train, y_train)


In [24]:
best_rf = grid_search_rf.best_estimator_
best_rf

In [25]:
y_pred_rf = best_rf.predict(X_test)
y_pred_rf

array([  0.57398418,   3.11113157, 138.88217278,  29.17053645,
        40.52775465,  21.0006637 , 220.58929749,   2.19228852,
       136.98316887, 195.4511026 ,  21.12139722,   0.5820874 ,
       137.98175171,   2.52904871,  22.37228851,  39.29281151,
        39.04195158, 202.24065926, 148.92603775,  33.03468601,
       141.81044395,   1.46075964,  23.46902749,   4.31960642,
        37.62653353,  28.22295934,  31.67650935,   2.06255616,
        14.86146441, 102.50927185,   3.02763651,  87.57714466,
       211.53782173,  28.89375812,  33.55045043,  71.26788651,
        32.31745452,   0.73011699,  17.15591996,  20.21793642,
        40.43924988,  29.67565725,  65.71894148,  56.26903301,
       136.71926392,  39.40364243,  45.63544153,  14.19451745,
        23.23773142,  88.71859188,   0.6491363 ,  23.88583153,
        11.49120523, 160.08709152,  26.48116061, 162.19011475,
        28.47025234,  33.76229431,  31.4096194 ,  59.96463793,
        13.90802077,   3.16785934,  56.15881277,  21.80

In [26]:
rf_metrics = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    'MSE': mean_squared_error(y_test, y_pred_rf),
    'MAE': mean_absolute_error(y_test, y_pred_rf),
    'R2': r2_score(y_test, y_pred_rf)
}
print(rf_metrics)


{'RMSE': 2.725271878696142, 'MSE': 7.427106812811998, 'MAE': 1.3265227134437547, 'R2': 0.9977405883446301}


In [27]:
len(df)

1516

In [28]:
data = df [len(df) - 15:]

In [29]:
for i in range(2):
    sma = ta.sma(data['Close'], length=5).iloc[-1]
    ema = ta.ema(data['Close'], length=5).iloc[-1]
    rsi = ta.rsi(data['Close'], length=14).iloc[-1]
    # sma_5, ema_5, rsi_14 = scaler.transform(np.array([sma, ema, rsi]).reshape(-1, 1))
    # new_close = best_rf.predict([[ sma_5[0], ema_5[0], rsi_14[0]]])
    new_close = best_rf.predict([[sma, ema, rsi]])
    print(new_close)
    new_row = pd.DataFrame({"Close": new_close, "formatted_time" : "0", "SMA" : sma, "EMA" : ema, "RSI" : rsi})
    data = pd.concat([data, new_row], ignore_index= True)
    

[155.66067451]
[153.45958481]


In [30]:

# Evaluate the model
rf_metrics = {
    'RMSE': np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    'MSE': mean_squared_error(y_test, y_pred_rf),
    'MAE': mean_absolute_error(y_test, y_pred_rf),
    'R2': r2_score(y_test, y_pred_rf)
}
print(rf_metrics)


{'RMSE': 2.725271878696142, 'MSE': 7.427106812811998, 'MAE': 1.3265227134437547, 'R2': 0.9977405883446301}


In [31]:
import pickle
with open('sol.pkl', 'wb') as f:
    pickle.dump(best_rf, f)