In [58]:
import numpy as np
import pandas as pd
from pandas_datareader import data

In [101]:
# Download the price history for the '^RUT' ticker from Yahoo Finance via
# pandas-datareader. No start/end dates are passed, so the library's default
# date window applies and the rows returned depend on when this cell is run.
df_russell = data.get_data_yahoo('^RUT')

In [102]:
# Preview the first rows of the downloaded frame (indexed by Date).
df_russell.head()

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-09-02,1146.030029,1128.050049,1128.050049,1146.030029,37426200,1146.030029
2015-09-03,1157.630005,1143.790039,1148.619995,1145.150024,35207000,1145.150024
2015-09-04,1142.550049,1130.75,1132.949951,1136.170044,31670900,1136.170044
2015-09-08,1162.449951,1139.069946,1139.069946,1161.76001,35486500,1161.76001
2015-09-09,1170.199951,1147.329956,1162.719971,1148.22998,36521200,1148.22998


In [103]:
# (number of rows, number of columns) of the downloaded frame.
df_russell.shape

(1258, 6)

In [104]:
# Move the Date index into an ordinary column so the rows get a plain
# 0..n-1 integer index.
# NOTE(review): this rebinds df_russell to its own transformed value, so the
# cell is not idempotent -- running it a second time resets the (already
# integer) index again and inserts an extra 'index' column.
df_russell = df_russell.reset_index()
df_russell

Unnamed: 0,Date,High,Low,Open,Close,Volume,Adj Close
0,2015-09-02,1146.030029,1128.050049,1128.050049,1146.030029,37426200,1146.030029
1,2015-09-03,1157.630005,1143.790039,1148.619995,1145.150024,35207000,1145.150024
2,2015-09-04,1142.550049,1130.750000,1132.949951,1136.170044,31670900,1136.170044
3,2015-09-08,1162.449951,1139.069946,1139.069946,1161.760010,35486500,1161.760010
4,2015-09-09,1170.199951,1147.329956,1162.719971,1148.229980,36521200,1148.229980
...,...,...,...,...,...,...,...
1253,2020-08-25,1574.239990,1556.780029,1572.650024,1571.209961,36193000,1571.209961
1254,2020-08-26,1572.569946,1559.729980,1570.719971,1560.189941,37543600,1560.189941
1255,2020-08-27,1573.380005,1555.270020,1561.359985,1564.560059,39295600,1564.560059
1256,2020-08-28,1578.339966,1567.130005,1570.459961,1578.339966,38558800,1578.339966


In [105]:
# Keep only the 'High' column -- it is the single feature the model uses.
# Selecting the wanted column (instead of dropping the six others with
# inplace=True) makes this cell safe to re-run: the original raised KeyError
# on a second run because the dropped columns were already gone.
# .copy() detaches the result from the parent frame so the later column
# assignment does not trigger SettingWithCopyWarning.
df_russell = df_russell[['High']].copy()
df_russell

Unnamed: 0,High
0,1146.030029
1,1157.630005
2,1142.550049
3,1162.449951
4,1170.199951
...,...
1253,1574.239990
1254,1572.569946
1255,1573.380005
1256,1578.339966


In [106]:
# Forecast horizon: how many rows ahead the model is asked to predict.
prediction_days = 30
# Target column: the 'High' value found `prediction_days` rows further down.
# Shifting upwards leaves the final `prediction_days` rows without a target (NaN).
df_russell['Prediction'] = df_russell['High'].shift(-prediction_days)
# Look at the end of the frame to confirm the trailing targets are empty.
df_russell.tail(10)

Unnamed: 0,High,Prediction
1248,1585.540039,
1249,1585.160034,
1250,1570.380005,
1251,1563.369995,
1252,1568.72998,
1253,1574.23999,
1254,1572.569946,
1255,1573.380005,
1256,1578.339966,
1257,1577.549316,


In [107]:
# Independent data set (features).
# Take every column except the 'Prediction' target and convert to a NumPy array.
features_only = df_russell.drop(columns=['Prediction'])
# Only the rows that have a known target are usable, i.e. everything except
# the last `prediction_days` rows.
n_labelled = len(df_russell) - prediction_days
x = np.array(features_only)[:n_labelled]
print(x)

[[1146.0300293 ]
 [1157.63000488]
 [1142.55004883]
 ...
 [1473.7800293 ]
 [1482.40002441]
 [1474.36999512]]


In [108]:
# Dependent data set (targets).
# Convert the 'Prediction' column into a 1-D NumPy array.
y = np.array(df_russell['Prediction'])
# Keep every value except the last `prediction_days` rows, whose target is NaN.
# The bound is written as `len - n` (the same expression used for `x`) rather
# than `[:-n]`: with prediction_days == 0 the slice `[:-0]` is empty, which
# would leave x and y with different lengths.
y = y[: len(df_russell) - prediction_days]
print(y)

[1162.81994629 1165.22998047 1168.92004395 ... 1573.38000488 1578.33996582
 1577.54931641]


In [109]:
# Split the data into 80% training and 20% testing.
# random_state pins the shuffle so the split -- and every metric computed from
# it -- is identical after a kernel restart.
# NOTE(review): the rows are still shuffled even though they form a time
# series, so test rows are interleaved in time with training rows; consider
# shuffle=False for a stricter out-of-sample evaluation.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)
# The last `prediction_days` feature rows of the original frame have no known
# target; keep them aside as the inputs to forecast from.
# Slicing from `len - n` (not `[-n:]`) keeps this the exact complement of `x`,
# including when prediction_days == 0, where `[-0:]` would return every row.
prediction_days_array = np.array(df_russell.drop(['Prediction'], axis=1))[
    len(df_russell) - prediction_days:
]
print(prediction_days_array)

[[1496.72998047]
 [1493.56005859]
 [1507.23999023]
 [1488.67004395]
 [1484.66003418]
 [1485.33996582]
 [1501.85998535]
 [1499.19995117]
 [1495.15002441]
 [1507.93005371]
 [1517.2199707 ]
 [1546.40002441]
 [1550.4699707 ]
 [1569.26000977]
 [1593.82995605]
 [1603.59997559]
 [1595.11999512]
 [1590.08996582]
 [1583.47998047]
 [1586.52001953]
 [1585.54003906]
 [1585.16003418]
 [1570.38000488]
 [1563.36999512]
 [1568.72998047]
 [1574.23999023]
 [1572.56994629]
 [1573.38000488]
 [1578.33996582]
 [1577.54931641]]


In [111]:
# ML Model
from sklearn.svm import SVR

In [112]:
# Support Vector Regression with a radial-basis-function kernel.
# Hyper-parameters are gathered in one place so they are easy to spot and tune.
svr_params = {"kernel": "rbf", "C": 1e3, "gamma": 1e-5}
svr_rbf = SVR(**svr_params)
# Fit on the training split; fit() returns the estimator, which the notebook displays.
svr_rbf.fit(x_train, y_train)

SVR(C=1000.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=1e-05,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [113]:
# test the model

In [114]:
# Evaluate the fitted model on the held-out test split.
# NOTE: for scikit-learn regressors, score() returns the coefficient of
# determination (R^2), not a classification accuracy; the "accuracy" label in
# the printed message should be read as R^2.
svr_rbf_confidence = svr_rbf.score(x_test, y_test)
print(f"SVR_RBF accuracy: {svr_rbf_confidence}")

SVR_RBF accuracy: 0.7117934074229273


In [115]:
# Predict on the test features and show the predictions above the true
# targets, separated by a dashed rule, for a quick visual comparison.
svm_prediction = svr_rbf.predict(x_test)
print(svm_prediction, "-" * 20, y_test, sep="\n")

[1642.28757353 1586.71406413 1171.8460659  1473.76549236 1136.37650844
 1653.3709831  1222.424609   1414.22678921 1239.00139779 1635.5176163
 1546.81499996 1570.45728759 1264.98605238 1653.22902183 1258.07515778
 1333.20391261 1603.47498322 1515.32431014 1577.49318413 1191.45901032
 1230.89061314 1557.7186229  1437.44115534 1225.4883198  1446.09079214
 1510.39052667 1579.32136401 1215.99575471 1565.58553093 1132.44092716
 1550.35182052 1526.36027926 1653.07750149 1209.61828731 1585.52566348
 1195.25306553 1571.16641004 1651.50930418 1537.50647442 1651.6788836
 1517.27702542 1558.26613003 1369.753021   1100.69136721 1475.43493884
 1101.1786798  1189.16966721 1606.93461441 1151.55942536 1275.77375323
 1393.65347709 1648.5650049  1590.96316032 1252.8350403  1652.49335134
 1169.19676801 1174.3202404  1652.21323272 1629.26297681 1418.889775
 1204.70459419 1605.73945122 1459.30620149 1529.08029202 1410.41724956
 1513.81703173 1189.5142436  1157.9287341  1543.97732338 1566.61166084
 1406.7420