In [15]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from concurrent.futures import ThreadPoolExecutor
import warnings
warnings.filterwarnings('ignore')

In [16]:
#load data
data = pd.read_csv("..\..\..\Data\Day\S&P 500 Historical Data00-20.csv")

In [17]:
#transform data
data = data.replace(",", "", regex=True)
data.Date = pd.to_datetime(data.Date)
data = data.sort_values(by=["Date"])
data.set_index('Date')
data.Price = data.Price.astype(float)
data.High = data.High.astype(float)
data.Low = data.Low.astype(float)
data.Open = data.Open.astype(float)
data["Vol."] = data["Vol."].astype(float)

In [18]:
data.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
4999,2000-01-03,1455.2,1469.2,1478.0,1438.4,,-0.95%
4998,2000-01-04,1399.4,1455.2,1455.2,1397.4,,-3.83%
4997,2000-01-05,1402.1,1399.4,1413.3,1377.7,,0.19%
4996,2000-01-06,1403.5,1402.1,1411.9,1392.0,,0.10%
4995,2000-01-07,1441.5,1403.5,1441.5,1400.5,,2.71%


In [19]:
data.index = data.index[::-1]

In [20]:
data.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2000-01-03,1455.2,1469.2,1478.0,1438.4,,-0.95%
1,2000-01-04,1399.4,1455.2,1455.2,1397.4,,-3.83%
2,2000-01-05,1402.1,1399.4,1413.3,1377.7,,0.19%
3,2000-01-06,1403.5,1402.1,1411.9,1392.0,,0.10%
4,2000-01-07,1441.5,1403.5,1441.5,1400.5,,2.71%


In [21]:
special_data = data[(data['Date'] > '2010-01-01') & (data['Date'] < '2014-01-01')]
special_data.index = data.index[:len(special_data)]

In [22]:
special_data.head(10)

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2010-01-04,1132.99,1116.56,1133.87,1116.56,,1.60%
1,2010-01-05,1136.52,1132.66,1136.63,1129.66,,0.31%
2,2010-01-06,1137.14,1135.71,1139.19,1133.95,,0.05%
3,2010-01-07,1141.69,1136.27,1142.46,1131.32,,0.40%
4,2010-01-08,1144.98,1140.52,1145.39,1136.22,,0.29%
5,2010-01-11,1146.98,1145.96,1149.74,1142.02,,0.17%
6,2010-01-12,1136.22,1143.81,1143.81,1131.77,,-0.94%
7,2010-01-13,1145.68,1137.31,1148.4,1133.18,,0.83%
8,2010-01-14,1148.46,1145.68,1150.41,1143.8,,0.24%
9,2010-01-15,1136.03,1147.72,1147.77,1131.39,,-1.08%


In [23]:
special_data = special_data.drop(columns=['Open', 'High', 'Low', 'Vol.', 'Change %'])

In [24]:
special_data.head()

Unnamed: 0,Date,Price
0,2010-01-04,1132.99
1,2010-01-05,1136.52
2,2010-01-06,1137.14
3,2010-01-07,1141.69
4,2010-01-08,1144.98


In [18]:
# Функция для построения модели ARIMA и получения прогноза
def build_arima_model_and_forecast(data, p, d, q, window_size=200, forecast_days=90):
    forecasts = []

    for i in range(len(data) - window_size - forecast_days + 1):
        try:
            window = data[i:i + window_size]
            actual_values = data['Price'][i + window_size:i + window_size + forecast_days]

            # Создаем и обучаем модель ARIMA
            model = ARIMA(window['Price'], order=(p, d, q))
            results = model.fit()

            # Делаем прогноз на 90 дней вперед
            forecast = results.get_forecast(steps=forecast_days)
            forecast_mean = forecast.predicted_mean.values

            # Рассчитываем относительное отклонение в процентах (Percentage Error)
            percentage_error = ((forecast_mean - actual_values) / actual_values) * 100

            # Добавляем результаты в список
            result = {
                'p': p,
                'd': d,
                'q': q,
                'begin_date': window['Date'].iloc[0],
                'end_date': window['Date'].iloc[-1] + pd.Timedelta(days=forecast_days - 1),
                'window_size': window_size,
                'forecast_days': forecast_days,
                'forecast_precision': np.mean(np.abs(percentage_error))
            }
            forecasts.append(result)
        except:
            continue

    return forecasts

In [20]:
# Задаем значения p, d, q
p_values = range(0, 11)
d_values = range(0, 11)
q_values = range(0, 11)

# Создаем пустой DataFrame для сохранения результатов
columns = ['p', 'd', 'q', 'begin_date', 'end_date', 'window_size', 'forecast_days', 'forecast_precision']
results_df = pd.DataFrame(columns=columns)

# Проходим по всем комбинациям p, d, q и строим прогнозы
for p in p_values:
    for d in d_values:
        for q in q_values:
            if p != 0 and d != 0 and q != 0:
                continue
            # Строим прогноз для текущих значений p, d, q
            forecasts = build_arima_model_and_forecast(special_data, p, d, q)
            print(f'Calculating:({p}, {d}, {q})')
            # Добавляем результаты в DataFrame
            results_df = pd.concat([results_df, pd.DataFrame(forecasts)], ignore_index = True)#results_df.append(forecasts, ignore_index=True)

# Выводим результаты
print(results_df)

Calculating:(0, 0, 0)
Calculating:(0, 0, 1)
Calculating:(0, 0, 2)
Calculating:(0, 0, 3)
Calculating:(0, 0, 4)
Calculating:(0, 0, 5)
Calculating:(0, 0, 6)
Calculating:(0, 0, 7)
Calculating:(0, 0, 8)
Calculating:(0, 0, 9)
Calculating:(0, 0, 10)
Calculating:(0, 1, 0)
Calculating:(0, 1, 1)
Calculating:(0, 1, 2)
Calculating:(0, 1, 3)
Calculating:(0, 1, 4)
Calculating:(0, 1, 5)
Calculating:(0, 1, 6)
Calculating:(0, 1, 7)
Calculating:(0, 1, 8)
Calculating:(0, 1, 9)
Calculating:(0, 1, 10)
Calculating:(0, 2, 0)
Calculating:(0, 2, 1)
Calculating:(0, 2, 2)
Calculating:(0, 2, 3)
Calculating:(0, 2, 4)
Calculating:(0, 2, 5)
Calculating:(0, 2, 6)
Calculating:(0, 2, 7)
Calculating:(0, 2, 8)
Calculating:(0, 2, 9)
Calculating:(0, 2, 10)
Calculating:(0, 3, 0)
Calculating:(0, 3, 1)
Calculating:(0, 3, 2)
Calculating:(0, 3, 3)
Calculating:(0, 3, 4)
Calculating:(0, 3, 5)
Calculating:(0, 3, 6)
Calculating:(0, 3, 7)
Calculating:(0, 3, 8)
Calculating:(0, 3, 9)
Calculating:(0, 3, 10)
Calculating:(0, 4, 0)
Calcul

In [78]:
results_df.to_csv('output2.csv', encoding='utf8')

NameError: name 'results_df' is not defined

In [2]:
import pandas as pd

In [79]:
# Замените 'your_file.csv' на путь к вашему CSV-файлу
file_path = 'output.csv'

# Укажите столбцы, которые вы хотите считать (пропустите первый столбец)
columns_to_read = [1, 2, 3, 4, 5, 6, 7, 8]  # Замените на фактические номера столбцов

# Считайте CSV файл, пропустив первый столбец
df = pd.read_csv(file_path, usecols=columns_to_read)

# Ваш DataFrame теперь содержит только указанные столбцы
print(df)

         p   d  q  begin_date    end_date  window_size  forecast_days  \
0        1   1  1  2010-01-04  2011-01-15          200             90   
1        1   1  1  2010-01-05  2011-01-16          200             90   
2        1   1  1  2010-01-06  2011-01-17          200             90   
3        1   1  1  2010-01-07  2011-01-18          200             90   
4        1   1  1  2010-01-08  2011-01-19          200             90   
...     ..  .. ..         ...         ...          ...            ...   
942962  10  10  0  2012-10-31  2013-11-13          200             90   
942963  10  10  0  2012-11-01  2013-11-16          200             90   
942964  10  10  0  2012-11-02  2013-11-17          200             90   
942965  10  10  0  2012-11-05  2013-11-18          200             90   
942966  10  10  0  2012-11-06  2013-11-19          200             90   

        forecast_precision  
0             5.247528e+00  
1             6.662312e+00  
2             5.932073e+00  
3      

In [80]:
# Преобразуйте столбец 'forecast_precision' к типу данных float
df['forecast_precision'] = pd.to_numeric(df['forecast_precision'], errors='coerce')

# Получите 10 строк с наименьшими значениями в столбце 'forecast_precision'
top_10_rows = df.nsmallest(10, 'forecast_precision', 'all')

In [81]:
top_10_rows

Unnamed: 0,p,d,q,begin_date,end_date,window_size,forecast_days,forecast_precision
296139,5,3,1,2012-02-10,2013-02-24,200,90,0.683348
367378,6,3,2,2012-02-10,2013-02-24,200,90,0.687887
156197,3,3,2,2012-02-22,2013-03-05,200,90,0.712689
651058,10,3,3,2012-02-13,2013-02-25,200,90,0.748898
300474,5,3,7,2012-04-03,2013-04-17,200,90,0.779713
87556,2,3,4,2012-03-20,2013-04-03,200,90,0.783506
17927,1,3,6,2010-01-06,2011-01-17,200,90,0.788216
297042,5,3,3,2010-01-06,2011-01-17,200,90,0.788264
735115,0,3,8,2010-01-06,2011-01-17,200,90,0.790529
156376,3,3,3,2010-01-06,2011-01-17,200,90,0.790849


In [82]:
result_df = df.groupby(['p', 'd', 'q'])['forecast_precision'].mean().reset_index()

In [83]:
print(result_df)

       p   d   q  forecast_precision
0      0   0   0        8.253004e+00
1      0   0   1        8.212925e+00
2      0   0   2        8.176922e+00
3      0   0   3        8.144883e+00
4      0   0   4        8.112281e+00
...   ..  ..  ..                 ...
1326  10  10   6        1.881057e+15
1327  10  10   7        1.733267e+18
1328  10  10   8        6.086592e+16
1329  10  10   9        3.773867e+14
1330  10  10  10        3.849851e+15

[1331 rows x 4 columns]


In [87]:
result_df.query('p == 0 and d == 1 and q == 10')

Unnamed: 0,p,d,q,forecast_precision
21,0,1,10,4.512704


In [84]:
# Сортировка по forecast_precision и получение первых 10 строк
sorted_result = result_df.sort_values(by='forecast_precision').head(10)

# Вывод результата
print(sorted_result)

       p  d   q  forecast_precision
1228  10  1   7            4.405649
739    6  1   2            4.435207
1104   9  1   4            4.437683
989    8  1  10            4.443677
747    6  1  10            4.451062
383    3  1   9            4.455509
1227  10  1   6            4.460707
381    3  1   7            4.464016
137    1  1   5            4.465026
12     0  1   1            4.465090


In [11]:
# Фильтрация строк
desired_row = result_df.query('p == 5 and d == 3 and q == 1')

# Вывод результата
print(desired_row)

     p  d  q  forecast_precision
618  5  3  1           17.777931


In [13]:
# Фильтрация строк, где p=d=q=5
desired_row = df.query('p == 0 and d == 1 and q == 10')

# Вывод результата
print(desired_row)

        p  d   q  begin_date    end_date window_size forecast_days  \
720896  0  1  10  2010-06-29  2011-07-10         200            90   
720897  0  1  10  2010-06-30  2011-07-11         200            90   
720898  0  1  10  2010-07-01  2011-07-12         200            90   
720899  0  1  10  2010-07-02  2011-07-13         200            90   
720900  0  1  10  2010-07-06  2011-07-16         200            90   
...    .. ..  ..         ...         ...         ...           ...   
721486  0  1  10  2012-10-31  2013-11-13         200            90   
721487  0  1  10  2012-11-01  2013-11-16         200            90   
721488  0  1  10  2012-11-02  2013-11-17         200            90   
721489  0  1  10  2012-11-05  2013-11-18         200            90   
721490  0  1  10  2012-11-06  2013-11-19         200            90   

        forecast_precision  
720896            3.229521  
720897            3.416586  
720898            3.560655  
720899            3.719240  
720900        

In [52]:
a = data[data['Date'] == '2013-02-22'].index[0]

In [53]:
b = data[data['Date'] == '2012-02-10'].index[0]

In [55]:
b

3046

In [54]:
a-b

258

In [90]:
result_df.query('p == 0 and d == 1 and q == 1')

Unnamed: 0,p,d,q,forecast_precision
12,0,1,1,4.46509


In [46]:
window.tail()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
3300,2013-02-15,1519.79,1521.38,1524.24,1514.14,,-0.10%
3301,2013-02-19,1530.94,1519.79,1530.94,1519.79,,0.73%
3302,2013-02-20,1511.95,1530.94,1530.94,1511.41,,-1.24%
3303,2013-02-21,1502.42,1511.95,1511.95,1497.29,,-0.63%
3304,2013-02-22,1515.6,1502.42,1515.64,1502.42,,0.88%


In [47]:
len(window)

258

In [56]:
window = data[b:b + 200]
actual_values = data['Price'][b + 200:b + 200 + 90]

In [60]:
# Создаем и обучаем модель ARIMA
model = ARIMA(window['Price'], order=(5, 3, 1))
results = model.fit()

In [61]:
# Делаем прогноз на 90 дней вперед
forecast = results.get_forecast(steps = 90)
forecast_mean = forecast.predicted_mean.values

In [57]:
len(actual_values)

90

In [58]:
len(window)

200

In [62]:
# Рассчитываем относительное отклонение в процентах (Percentage Error)
percentage_error = ((forecast_mean - actual_values) / actual_values) * 100

In [63]:
percentage_error

3246   -0.354692
3247   -0.684599
3248   -0.477881
3249    0.364205
3250    0.613890
          ...   
3331    1.662665
3332    1.365430
3333    1.915398
3334    1.388669
3335    1.141362
Name: Price, Length: 90, dtype: float64

In [64]:
np.mean(np.abs(percentage_error))

0.683362890333182

In [65]:
forecast_mean

array([1404.92909415, 1406.25642149, 1409.41234045, 1414.59331974,
       1415.68773706, 1416.58132772, 1419.35833557, 1421.36390443,
       1423.91463263, 1426.68689028, 1428.70976443, 1430.77081994,
       1433.07563469, 1435.25964899, 1437.56812861, 1439.88565137,
       1442.06775819, 1444.26037748, 1446.47624898, 1448.67056515,
       1450.88593715, 1453.09319509, 1455.27231637, 1457.44861201,
       1459.62114677, 1461.78549888, 1463.94861081, 1466.10411269,
       1468.2494635 , 1470.38881177, 1472.52160593, 1474.64765955,
       1476.76803117, 1478.88144509, 1480.98761802, 1483.08719345,
       1485.18003587, 1487.26620917, 1489.34586271, 1491.41875598,
       1493.48486432, 1495.54428167, 1497.59698228, 1499.6429925 ,
       1501.68233128, 1503.71495535, 1505.7408639 , 1507.76006964,
       1509.77256845, 1511.77836722, 1513.77746795, 1515.76986315,
       1517.75555312, 1519.73453939, 1521.70682141, 1523.67240072,
       1525.63127744, 1527.58345032, 1529.52891945, 1531.46768