In [1]:
%%javascript
// Override the browser tab title with this notebook's name.
document.title='Jupyter Lab - Oil prices.ipynb'

<IPython.core.display.Javascript object>

In [156]:
from glob import glob
from tqdm import tqdm  
import pandas as pd
import numpy as np
import os
 

In [111]:
def freq_string_generator(date_dict):
    """Build a pandas frequency string from a mapping of time units to counts.

    Parameters
    ----------
    date_dict : dict
        May contain the keys ``'seconds'``, ``'minutes'``, ``'days'`` and
        ``'months'``. Any other key is ignored. Units whose count is missing
        or falsy (``0``, ``None``) are skipped so that e.g.
        ``dict(seconds=0, minutes=1)`` yields ``'1min'`` rather than a string
        containing a zero-length ``0s`` component.

    Returns
    -------
    str
        The ``<count><alias>`` pieces concatenated in the fixed order
        seconds, minutes, days, months (e.g. ``'30s2D'``). An empty string is
        returned when no unit has a truthy count.
    """
    # 's' and 'min' are the non-deprecated spellings of the older 'S' / 'T'
    # aliases and are understood by older pandas releases as well.
    # NOTE(review): 'M' (month end) is spelled 'ME' in pandas >= 2.2 - confirm
    # against the pandas version this notebook targets.
    freq_map = {'seconds': 's', 'minutes': 'min', 'days': 'D', 'months': 'M'}

    return ''.join(
        str(date_dict[unit]) + alias
        for unit, alias in freq_map.items()
        if unit in date_dict and date_dict[unit]
    )

In [57]:
%%time
if os.path.exists('oil_dataset.csv'):
    print('Loading dataset from csv')
    df = pd.read_csv('oil_dataset.csv', index_col=0)
    df.index = pd.to_datetime(df.index)
else:
    DATASET_PATH = './data'
    files = [file for file in glob(DATASET_PATH + '/*') if '.' not in file.split('/')[-1]]
    df = []
    for file in tqdm(files):
        df_ = pd.read_csv(file, header=None)
        df_.columns = [0,1,'price','volume']
        df_['date'] = pd.to_datetime(df_[0] + ' ' + df_[1])
        df_ = df_[['date','price','volume']].set_index('date')
        df.append(df_)
    df = pd.concat(df).sort_index()
    df.to_csv('oil_dataset.csv')

Loading dataset from csv
CPU times: user 2min 56s, sys: 2min 54s, total: 5min 51s
Wall time: 7min 44s


In [168]:
# Show the combined DataFrame (pandas abbreviates the middle rows when rendering).
df

Unnamed: 0_level_0,price,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-01-03 18:00:00,79.63,3
2010-01-03 18:00:00,79.63,4
2010-01-03 18:00:00,79.63,3
2010-01-03 18:00:00,79.63,2
2010-01-03 18:00:00,79.63,2
...,...,...
2020-05-19 14:00:26.065,31.73,2
2020-05-19 14:00:26.069,31.73,1
2020-05-19 14:00:26.072,31.73,1
2020-05-19 14:00:26.075,31.73,1


### Take a small subset of the data to test things out

In [121]:
# Work on the first 1/50th of the rows, keeping only the price column,
# with the index converted to second-resolution periods.
df_test = df.iloc[:len(df) // 50][['price']]
df_test.index = pd.to_datetime(df_test.index).to_period('S')
df_test

Unnamed: 0_level_0,price
date,Unnamed: 1_level_1
2010-01-03 18:00:00,79.63
2010-01-03 18:00:00,79.63
2010-01-03 18:00:00,79.63
2010-01-03 18:00:00,79.63
2010-01-03 18:00:00,79.63
...,...
2010-04-01 12:20:56,84.90
2010-04-01 12:20:56,84.89
2010-04-01 12:20:56,84.89
2010-04-01 12:20:56,84.89


In [166]:
def resample(df, freq_dict, delta_threshold = 0.005, threshold_margin = 0.95):
    """Resample a price series and keep the periods with a large relative move.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like index and a single value column. The
        column name is discarded after aggregation, so with several columns
        the resulting 'first'/'last' labels would be duplicated.
    freq_dict : str or dict
        Either a ready-made pandas frequency string (e.g. ``'1min'``) or a
        mapping with any of the keys ``'seconds'``, ``'minutes'``, ``'days'``,
        ``'months'``. Units that are missing or have a falsy count are skipped.
    delta_threshold : float, default 0.005
        Nominal relative price change a period must show to be kept.
    threshold_margin : float, default 0.95
        Multiplier applied to ``delta_threshold`` before filtering, so periods
        slightly below the nominal threshold are still kept. The default
        reproduces the previously hard-coded ``0.95`` factor.

    Returns
    -------
    pandas.DataFrame
        One row per kept period with the columns ``first``, ``last``,
        ``delta`` (``last / first - 1``) and ``delta_next_day``. Despite its
        name, ``delta_next_day`` is the ``delta`` of the immediately following
        resampling period, not of the next calendar day.

    Raises
    ------
    ValueError
        If ``freq_dict`` is a mapping that contains no usable unit.
    """
    if isinstance(freq_dict, str):
        freq_string = freq_dict
    else:
        # 's' and 'min' are the non-deprecated spellings of the older
        # 'S' / 'T' aliases and are accepted by older pandas releases too.
        # NOTE(review): 'M' (month end) is spelled 'ME' in pandas >= 2.2 -
        # confirm against the pandas version this notebook targets.
        freq_map = {'seconds': 's', 'minutes': 'min', 'days': 'D', 'months': 'M'}
        freq_string = ''.join(
            str(freq_dict[unit]) + alias
            for unit, alias in freq_map.items()
            if unit in freq_dict and freq_dict[unit]
        )
        if not freq_string:
            # Fail with a readable message instead of handing '' to pandas.
            raise ValueError(
                'freq_dict must contain a non-zero count for at least one of: '
                + ', '.join(freq_map)
            )

    resampled = df.resample(freq_string).agg(['first', 'last'])
    # Drop the source column name so the columns are simply 'first' / 'last'.
    resampled.columns = resampled.columns.droplevel(0)
    resampled['delta'] = resampled['last'] / resampled['first'] - 1
    resampled['delta_next_day'] = resampled['delta'].shift(-1)
    # The final period has no successor (its shifted delta is NaN), so drop it.
    resampled = resampled.iloc[:-1]

    # Periods without data have a NaN delta; the comparison is False for them,
    # so they are filtered out here as well.
    is_large_move = resampled['delta'].abs() > threshold_margin * delta_threshold
    return resampled.loc[is_large_move]
    

### Below: the dataset resampled at 1-minute frequency and filtered by a 0.005 relative price change

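I took the original data, resampled it into 1-minute periods, took the first and the last price in each period, and computed the relative change between them (`last / first - 1`). Only periods whose absolute change exceeds 0.95 × the threshold are kept.
`delta_next_day` is the delta calculated the same way for the next period (despite its name, not the next day).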

In [167]:
# Resample the test subset with the frequency built from seconds=0, minutes=1
# and a delta_threshold of 0.005.
resample(df_test,dict(seconds=0, minutes=1), delta_threshold=0.005)

Unnamed: 0_level_0,first,last,delta,delta_next_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-06 10:30,82.16,81.49,-0.008155,-0.000736
2010-01-06 11:21,81.95,82.67,0.008786,-0.003145
2010-01-12 16:30,80.55,80.16,-0.004842,-0.001123
2010-01-13 10:30,79.06,78.47,-0.007463,0.002039
2010-01-21 11:09,77.12,76.49,-0.008169,0.0
2010-01-27 10:32,74.57,74.13,-0.0059,-0.00027
2010-01-27 13:17,73.95,73.51,-0.00595,-0.001904
2010-01-29 11:59,73.7,73.3,-0.005427,0.002321
2010-02-03 14:26,76.61,77.08,0.006135,-0.000649
2010-02-05 08:30,72.62,73.11,0.006747,-0.006974
