## Addressing missing data issues

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display
from openbb import obb

In [2]:
obb.user.preferences.output_type = "dataframe"

Fetches historical price data for the equity "AAPL" from 2020-07-01 to 2023-07-06 using the "yfinance" provider and stores it in 'df'

In [3]:
df = obb.equity.price.historical(
    "AAPL",
    start_date="2020-07-01",
    end_date="2023-07-06",
    provider="yfinance",
)

In [4]:
display(df)

Unnamed: 0_level_0,open,high,low,close,volume,split_ratio,dividend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-07-01,91.279999,91.839996,90.977501,91.027496,110737200,0.0,0.0
2020-07-02,91.962502,92.617500,90.910004,91.027496,114041600,0.0,0.0
2020-07-06,92.500000,93.945000,92.467499,93.462502,118655600,0.0,0.0
2020-07-07,93.852501,94.654999,93.057503,93.172501,112424400,0.0,0.0
2020-07-08,94.180000,95.375000,94.089996,95.342499,117092000,0.0,0.0
...,...,...,...,...,...,...,...
2023-06-29,189.080002,190.070007,188.940002,189.589996,46347300,0.0,0.0
2023-06-30,191.630005,194.479996,191.259995,193.970001,85069600,0.0,0.0
2023-07-03,193.779999,193.880005,191.759995,192.460007,31458200,0.0,0.0
2023-07-05,191.570007,192.979996,190.619995,191.330002,46920300,0.0,0.0


Generates a date range from the minimum to maximum dates in 'df' with daily frequency and stores it in 'calendar_dates'

In [5]:
calendar_dates = pd.date_range(start=df.index.min(), end=df.index.max(), freq="D")

In [6]:
display(calendar_dates)

DatetimeIndex(['2020-07-01', '2020-07-02', '2020-07-03', '2020-07-04',
               '2020-07-05', '2020-07-06', '2020-07-07', '2020-07-08',
               '2020-07-09', '2020-07-10',
               ...
               '2023-06-27', '2023-06-28', '2023-06-29', '2023-06-30',
               '2023-07-01', '2023-07-02', '2023-07-03', '2023-07-04',
               '2023-07-05', '2023-07-06'],
              dtype='datetime64[ns]', length=1101, freq='D')

Reindexes 'df' to the 'calendar_dates', introducing missing values for non-trading days, and stores it in 'calendar_prices'

In [7]:
calendar_prices = df.reindex(calendar_dates)

In [8]:
display(calendar_prices)

Unnamed: 0,open,high,low,close,volume,split_ratio,dividend
2020-07-01,91.279999,91.839996,90.977501,91.027496,110737200.0,0.0,0.0
2020-07-02,91.962502,92.617500,90.910004,91.027496,114041600.0,0.0,0.0
2020-07-03,,,,,,,
2020-07-04,,,,,,,
2020-07-05,,,,,,,
...,...,...,...,...,...,...,...
2023-07-02,,,,,,,
2023-07-03,193.779999,193.880005,191.759995,192.460007,31458200.0,0.0,0.0
2023-07-04,,,,,,,
2023-07-05,191.570007,192.979996,190.619995,191.330002,46920300.0,0.0,0.0


Backfills missing values in 'calendar_prices' and stores the result in 'df_1'

In [9]:
df_1 = calendar_prices.bfill()

In [10]:
display(df_1)

Unnamed: 0,open,high,low,close,volume,split_ratio,dividend
2020-07-01,91.279999,91.839996,90.977501,91.027496,110737200.0,0.0,0.0
2020-07-02,91.962502,92.617500,90.910004,91.027496,114041600.0,0.0,0.0
2020-07-03,92.500000,93.945000,92.467499,93.462502,118655600.0,0.0,0.0
2020-07-04,92.500000,93.945000,92.467499,93.462502,118655600.0,0.0,0.0
2020-07-05,92.500000,93.945000,92.467499,93.462502,118655600.0,0.0,0.0
...,...,...,...,...,...,...,...
2023-07-02,193.779999,193.880005,191.759995,192.460007,31458200.0,0.0,0.0
2023-07-03,193.779999,193.880005,191.759995,192.460007,31458200.0,0.0,0.0
2023-07-04,191.570007,192.979996,190.619995,191.330002,46920300.0,0.0,0.0
2023-07-05,191.570007,192.979996,190.619995,191.330002,46920300.0,0.0,0.0


Forward fills missing values in 'calendar_prices' and stores the result in 'df_1'

In [11]:
df_1 = calendar_prices.ffill()

In [12]:
display(df_1)

Unnamed: 0,open,high,low,close,volume,split_ratio,dividend
2020-07-01,91.279999,91.839996,90.977501,91.027496,110737200.0,0.0,0.0
2020-07-02,91.962502,92.617500,90.910004,91.027496,114041600.0,0.0,0.0
2020-07-03,91.962502,92.617500,90.910004,91.027496,114041600.0,0.0,0.0
2020-07-04,91.962502,92.617500,90.910004,91.027496,114041600.0,0.0,0.0
2020-07-05,91.962502,92.617500,90.910004,91.027496,114041600.0,0.0,0.0
...,...,...,...,...,...,...,...
2023-07-02,191.630005,194.479996,191.259995,193.970001,85069600.0,0.0,0.0
2023-07-03,193.779999,193.880005,191.759995,192.460007,31458200.0,0.0,0.0
2023-07-04,193.779999,193.880005,191.759995,192.460007,31458200.0,0.0,0.0
2023-07-05,191.570007,192.979996,190.619995,191.330002,46920300.0,0.0,0.0


Reindexes 'df' to the 'calendar_dates' and performs linear interpolation to fill missing values, storing the result in 'linear'

In [13]:
calendar_prices = df.reindex(calendar_dates)
linear = calendar_prices.interpolate(method="linear")

In [14]:
display(linear)

Unnamed: 0,open,high,low,close,volume,split_ratio,dividend
2020-07-01,91.279999,91.839996,90.977501,91.027496,1.107372e+08,0.0,0.0
2020-07-02,91.962502,92.617500,90.910004,91.027496,1.140416e+08,0.0,0.0
2020-07-03,92.096876,92.949375,91.299377,91.636248,1.151951e+08,0.0,0.0
2020-07-04,92.231251,93.281250,91.688751,92.244999,1.163486e+08,0.0,0.0
2020-07-05,92.365625,93.613125,92.078125,92.853750,1.175021e+08,0.0,0.0
...,...,...,...,...,...,...,...
2023-07-02,193.063334,194.080002,191.593328,192.963338,4.932867e+07,0.0,0.0
2023-07-03,193.779999,193.880005,191.759995,192.460007,3.145820e+07,0.0,0.0
2023-07-04,192.675003,193.430000,191.189995,191.895004,3.918925e+07,0.0,0.0
2023-07-05,191.570007,192.979996,190.619995,191.330002,4.692030e+07,0.0,0.0


Reindexes 'df' to the 'calendar_dates' and performs cubic spline interpolation to fill missing values, storing the result in 'cubic'

In [15]:
calendar_prices = df.reindex(calendar_dates)
cubic = calendar_prices.interpolate(method="cubicspline")

In [16]:
display(cubic)

Unnamed: 0,open,high,low,close,volume,split_ratio,dividend
2020-07-01,91.279999,91.839996,90.977501,91.027496,1.107372e+08,0.000000e+00,0.000000e+00
2020-07-02,91.962502,92.617500,90.910004,91.027496,1.140416e+08,0.000000e+00,0.000000e+00
2020-07-03,91.905178,93.018261,91.109728,91.804272,1.181507e+08,3.592598e-24,1.363686e-15
2020-07-04,91.628604,93.240770,91.492200,92.824548,1.213994e+08,7.185197e-24,2.727373e-15
2020-07-05,91.653353,93.483519,91.972948,93.555050,1.221227e+08,7.185197e-24,2.727373e-15
...,...,...,...,...,...,...,...
2023-07-02,194.071253,195.513330,192.326285,194.488423,5.806444e+07,0.000000e+00,-2.115795e-21
2023-07-03,193.779999,193.880005,191.759995,192.460007,3.145820e+07,0.000000e+00,0.000000e+00
2023-07-04,192.906895,193.224617,191.295466,191.403875,3.331034e+07,0.000000e+00,4.882604e-22
2023-07-05,191.570007,192.979996,190.619995,191.330002,4.692030e+07,0.000000e+00,0.000000e+00


**Jason Strimpel** is the founder of <a href='https://pyquantnews.com/'>PyQuant News</a> and co-founder of <a href='https://www.tradeblotter.io/'>Trade Blotter</a>. His career in algorithmic trading spans 20+ years. He previously traded for a Chicago-based hedge fund, was a risk manager at JPMorgan, and managed production risk technology for an energy derivatives trading firm in London. In Singapore, he served as APAC CIO for an agricultural trading firm and built the data science team for a global metals trading firm. Jason holds degrees in Finance and Economics and a Master's in Quantitative Finance from the Illinois Institute of Technology. His career spans America, Europe, and Asia. He shares his expertise through the <a href='https://pyquantnews.com/subscribe-to-the-pyquant-newsletter/'>PyQuant Newsletter</a>, social media, and has taught over 1,000+ algorithmic trading with Python in his popular course **<a href='https://gettingstartedwithpythonforquantfinance.com/'>Getting Started With Python for Quant Finance</a>**. All code is for educational purposes only. Nothing provided here is financial advise. Use at your own risk.