## Общие методы для обработки данных
В данном разделе будут находится общие методы.

In [63]:
import pandas as pd

# читаем данные и создаём таблицу
def data_loader(path: str):
    return pd.read_csv(path)

# преобразуем данные так, как нам необходимо
def data_type_and_date_transform(data: pd.DataFrame) ->pd.DataFrame:
    data = data.replace(",", "", regex=True)
    data.Date = pd.to_datetime(data.Date)
    data = data.sort_values(by=["Date"])
    data.set_index('Date')
    data.Price = data.Price.astype(float)
    data.High = data.High.astype(float)
    data.Low = data.Low.astype(float)
    data.Open = data.Open.astype(float)
    data["Vol."] = data["Vol."].astype(float)
    return data

# Обрезает данные по дате. 
# Важно, чтобы даты в датафрейме были в том же формате что и начальная и конечная дата
def cut_by_date(data: pd.DataFrame, begin_date: str, end_date: str) -> pd.DataFrame:
     result = data[(data['Date'] > begin_date) & (data['Date'] < end_date)]
     result.index = data.index[:len(result)]
     return result

## Смещение данных с запоминанием их реального положения
Поскольку нам необходимы данные без пропущенных дней, то один из вариантов это смещение данных с запоминанием реальной даты. В данном случае все дни будут идти без пропусков.

In [50]:
# Считываем данные
general_data = data_loader("../../Data/Day/S&P 500 Historical Data00-20.csv")
general_data = data_type_and_date_transform(general_data)

In [53]:
# Необходимо развернуть данные и исправить индексы
general_data.index =general_data.index[::-1]

In [57]:
# Проверяем что с данными всё ок
general_data.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2000-01-03,1455.2,1469.2,1478.0,1438.4,,-0.95%
1,2000-01-04,1399.4,1455.2,1455.2,1397.4,,-3.83%
2,2000-01-05,1402.1,1399.4,1413.3,1377.7,,0.19%
3,2000-01-06,1403.5,1402.1,1411.9,1392.0,,0.10%
4,2000-01-07,1441.5,1403.5,1441.5,1400.5,,2.71%


In [64]:
# Производим обрезку данных 
special_data = cut_by_date(general_data, "2010-01-01", "2014-01-01")

In [66]:
# Проверяем получилось ли сделать обрезку
special_data

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2010-01-04,1132.99,1116.56,1133.87,1116.56,,1.60%
1,2010-01-05,1136.52,1132.66,1136.63,1129.66,,0.31%
2,2010-01-06,1137.14,1135.71,1139.19,1133.95,,0.05%
3,2010-01-07,1141.69,1136.27,1142.46,1131.32,,0.40%
4,2010-01-08,1144.98,1140.52,1145.39,1136.22,,0.29%
...,...,...,...,...,...,...,...
1001,2013-12-24,1833.32,1828.02,1833.32,1828.02,,0.29%
1002,2013-12-26,1842.02,1834.96,1842.84,1834.96,,0.47%
1003,2013-12-27,1841.40,1842.97,1844.89,1839.81,,-0.03%
1004,2013-12-30,1841.07,1841.47,1842.47,1838.77,,-0.02%


In [67]:
special_data['new_index'] = special_data.Price
special_data['new_date'] = special_data.Price
special_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_data['new_index'] = special_data.Price
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_data['new_date'] = special_data.Price


Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %,new_index,new_date
0,2010-01-04,1132.99,1116.56,1133.87,1116.56,,1.60%,1132.99,1132.99
1,2010-01-05,1136.52,1132.66,1136.63,1129.66,,0.31%,1136.52,1136.52
2,2010-01-06,1137.14,1135.71,1139.19,1133.95,,0.05%,1137.14,1137.14
3,2010-01-07,1141.69,1136.27,1142.46,1131.32,,0.40%,1141.69,1141.69
4,2010-01-08,1144.98,1140.52,1145.39,1136.22,,0.29%,1144.98,1144.98


In [86]:
import datetime
import calendar
import time
# Получаем значения нчального дня(в тестовом варианте это значение получается равным 1262563200)
special_data.new_index[0] = calendar.timegm(time.strptime(str(special_data.Date[0])[0:10], '%Y-%m-%d')) 
special_data.new_date[0] = datetime.datetime.fromtimestamp(special_data.new_index[0]).strftime('%Y-%m-%d %H:%M:%S')
for i in range(len(special_data.new_index)):
    if i > 0:
        special_data.new_index[i] = special_data.new_index[i - 1] + 86400
        special_data.new_date[i] = datetime.datetime.fromtimestamp(special_data.new_index[i]).strftime('%Y-%m-%d %H:%M:%S')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_data.new_index[0] = calendar.timegm(time.strptime(str(special_data.Date[0])[0:10], '%Y-%m-%d'))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_data.new_date[0] = datetime.datetime.fromtimestamp(special_data.new_index[0]).strftime('%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_data.new_index[i] = special_data.new_index[i - 1] + 86400
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the 

In [87]:
#change indexes
special_data.set_index('new_date', inplace=True)

In [89]:
# Смотрим результат
special_data.head(20)

Unnamed: 0_level_0,Date,Price,Open,High,Low,Vol.,Change %,new_index
new_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04 07:00:00,2010-01-04,1132.99,1116.56,1133.87,1116.56,,1.60%,1262563000.0
2010-01-05 07:00:00,2010-01-05,1136.52,1132.66,1136.63,1129.66,,0.31%,1262650000.0
2010-01-06 07:00:00,2010-01-06,1137.14,1135.71,1139.19,1133.95,,0.05%,1262736000.0
2010-01-07 07:00:00,2010-01-07,1141.69,1136.27,1142.46,1131.32,,0.40%,1262822000.0
2010-01-08 07:00:00,2010-01-08,1144.98,1140.52,1145.39,1136.22,,0.29%,1262909000.0
2010-01-09 07:00:00,2010-01-11,1146.98,1145.96,1149.74,1142.02,,0.17%,1262995000.0
2010-01-10 07:00:00,2010-01-12,1136.22,1143.81,1143.81,1131.77,,-0.94%,1263082000.0
2010-01-11 07:00:00,2010-01-13,1145.68,1137.31,1148.4,1133.18,,0.83%,1263168000.0
2010-01-12 07:00:00,2010-01-14,1148.46,1145.68,1150.41,1143.8,,0.24%,1263254000.0
2010-01-13 07:00:00,2010-01-15,1136.03,1147.72,1147.77,1131.39,,-1.08%,1263341000.0


## Добавление в выходные дни цены предыдущего дня
Другой вариант создания данных без промежутков. В данном случае при отсутствие следующего убирается пустой промежуток по формуле day_i = (day_prev_work + day_next_work) /2

In [151]:
# Считываем данные
general_data = data_loader("../../Data/Day/S&P 500 Historical Data00-20.csv")
general_data = data_type_and_date_transform(general_data)

In [152]:
# Необходимо развернуть данные и исправить индексы
general_data.index = general_data.index[::-1]

In [153]:
# Проверяем что с данными всё ок
general_data.head()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2000-01-03,1455.2,1469.2,1478.0,1438.4,,-0.95%
1,2000-01-04,1399.4,1455.2,1455.2,1397.4,,-3.83%
2,2000-01-05,1402.1,1399.4,1413.3,1377.7,,0.19%
3,2000-01-06,1403.5,1402.1,1411.9,1392.0,,0.10%
4,2000-01-07,1441.5,1403.5,1441.5,1400.5,,2.71%


In [154]:
# Производим обрезку данных 
special_data = cut_by_date(general_data, "2010-01-01", "2014-01-01")

In [155]:
# Проверяем получилось ли сделать обрезку
special_data

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2010-01-04,1132.99,1116.56,1133.87,1116.56,,1.60%
1,2010-01-05,1136.52,1132.66,1136.63,1129.66,,0.31%
2,2010-01-06,1137.14,1135.71,1139.19,1133.95,,0.05%
3,2010-01-07,1141.69,1136.27,1142.46,1131.32,,0.40%
4,2010-01-08,1144.98,1140.52,1145.39,1136.22,,0.29%
...,...,...,...,...,...,...,...
1001,2013-12-24,1833.32,1828.02,1833.32,1828.02,,0.29%
1002,2013-12-26,1842.02,1834.96,1842.84,1834.96,,0.47%
1003,2013-12-27,1841.40,1842.97,1844.89,1839.81,,-0.03%
1004,2013-12-30,1841.07,1841.47,1842.47,1838.77,,-0.02%


In [156]:
day = 86400
data_len = len(special_data.Date)
for i in range(1, data_len):
    day_before = calendar.timegm(time.strptime(str(special_data.Date[i-1])[0:10], '%Y-%m-%d'))
    this_day = calendar.timegm(time.strptime(str(special_data.Date[i])[0:10], '%Y-%m-%d'))
    if(day_before + day == this_day):
        continue
    # we need to add some new days
    while(True):
        temp_row = {'Date': [datetime.datetime.fromtimestamp(day_before + day).strftime('%Y-%m-%d %H:%M:%S')],  
                                            'Price': [special_data.Price[i-1]],
                                            'Open': [special_data.High[i-1]],
                                            'Low': [special_data.Low[i-1]],
                                            'Vol.': [special_data['Vol.'][i-1]],
                                            'Change %': [special_data['Change %'][i-1]]}
        special_data = pd.concat([special_data, pd.DataFrame.from_records(temp_row)], ignore_index=True)
        day_before = calendar.timegm(time.strptime(str(temp_row['Date'][0])[0:10], '%Y-%m-%d'))
        if(day_before + day >= this_day):
            break


In [167]:
special_data.to_csv('test.csv')

In [179]:
# Добавляем столбец Даты в long. Сортируем по данному столбцу и удаляем данный столбец
special_data['new_index'] = special_data.Price
for i in range(len(special_data.new_index)):
    special_data.new_index[i] = calendar.timegm(time.strptime(str(special_data.Date[i])[0:10], '%Y-%m-%d'))
temp = special_data.sort_values(by=['new_index'])
special_data = temp.drop(columns=['new_index'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  special_data.new_index[i] = calendar.timegm(time.strptime(str(special_data.Date[i])[0:10], '%Y-%m-%d'))


In [180]:
special_data

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
0,2010-01-04 00:00:00,1132.99,1116.56,1133.87,1116.56,,1.60%
1,2010-01-05 00:00:00,1136.52,1132.66,1136.63,1129.66,,0.31%
2,2010-01-06 00:00:00,1137.14,1135.71,1139.19,1133.95,,0.05%
3,2010-01-07 00:00:00,1141.69,1136.27,1142.46,1131.32,,0.40%
4,2010-01-08 00:00:00,1144.98,1140.52,1145.39,1136.22,,0.29%
...,...,...,...,...,...,...,...
1003,2013-12-27 00:00:00,1841.40,1842.97,1844.89,1839.81,,-0.03%
1456,2013-12-28 07:00:00,1841.40,1844.89,,1839.81,,-0.03%
1457,2013-12-29 07:00:00,1841.40,1844.89,,1839.81,,-0.03%
1004,2013-12-30 00:00:00,1841.07,1841.47,1842.47,1838.77,,-0.02%


Полученный результат имеет сбитый индекс, но поскольку при прогназировании нас интересует дата вместо индекса, то это не критично.

## Анализ кол-ва выходных дней и добавление в выходные дни цены, изменяющейся от цены рабочего дня, к цене выходного дня

Нужно реализовать