# Preprocess des data yfinance 

In [10]:
import yfinance as yf 
import numpy as np
import pandas as pd
import torch
import sklearn
from sklearn.preprocessing import MinMaxScaler

In [41]:
def Vix_treatment(data_indicator):
    data_indicator = data_indicator.drop(columns = ['Adj Close','Volume'])
    
    new_col_name = {}
    for col in data_indicator.columns :
                new_col_name[col] =   'VIX'+ '_' + col
    data_indicator.rename(columns=new_col_name,inplace=True)
    return data_indicator

def avg_compute (df, mean_period, dropna = True):
    origin_df = df
    for periode in mean_period :
        
        reduced_mean = origin_df.rolling(periode).mean()
        new_col_name = {}
        
        for col in reduced_mean.columns :
            new_col_name[col] = 'avg_'+ col + '_' + str(periode)
            
        reduced_mean.rename(columns=new_col_name,inplace=True)
        
        df= pd.concat([df, reduced_mean], axis =1)
    if dropna :
        return(df.dropna())
        
    return(df)

def value_split (Values, Mixed_df, Other_data, Periodes):
    Value_df_list = {}
    scaler = MinMaxScaler()
    for val in Values :
        
        temp_df = Mixed_df.xs(val, axis=1, level=1, drop_level=True)
        temp_df.drop(columns = 'Adj Close')
        temp_df = pd.concat([temp_df, Other_data], axis =1)
        temp_df = avg_compute(temp_df,Periodes)
        scaled_features_df = scaler.fit_transform(temp_df)
        temp_df = pd.DataFrame(scaled_features_df,columns=temp_df.columns)
#         temp_df = pd.DataFrame(scaled_features_df, index = temp_df.index, columns=temp_df.columns)
        
        Value_df_list[val] = temp_df
    return Value_df_list
      


def data_fetch (Periode, Interval, Periodes, Values = ["SPY", "AAPL"] ):
    tickers_names = ""
    for val in Values :
        tickers_names += " "+val 
    
    data = yf.download(tickers = tickers_names,  # list of tickers
                period =Periode,         # time period
                interval = Interval,       # trading interval
                prepost = False,       # download pre/post market hours data?
                repair = True)  
    data_indicator = yf.download(tickers = "^VIX",  # list of tickers
                period = Periode,         # time period
                interval = Interval,       # trading interval
                prepost = False,       # download pre/post market hours data?
                repair = True) 

    data_indicator = Vix_treatment(data_indicator)
    extended_data = value_split(Values,data,data_indicator,Periodes)
    
    
    return(extended_data)
raw_data_df = data_fetch (Periode="2mo", Interval="1d", Periodes = [2,3,5]) 

[*********************100%***********************]  2 of 2 completed
^VIX: fixed 344/5042 value=0 errors in 2m price data
^VIX: fixed 687/1045 value=0 errors in 2m price data
^VIX: fixed 1017/3305 value=0 errors in 5m price data
^VIX: fixed 1032/1752 value=0 errors in 2m price data
^VIX: fixed 291/575 value=0 errors in 5m price data
^VIX: fixed 1152/1152 value=0 errors in 15m price data
^VIX: fixed 1032/1752 value=0 errors in 2m price data
^VIX: fixed 364/719 value=0 errors in 5m price data
^VIX: fixed 144/144 value=0 errors in 15m price data
^VIX: fixed 575/3611 value=0 errors in 30m price data
^VIX: fixed 6690/7458 value=0 errors in 2m price data
^VIX: fixed 512/514 value=0 errors in 2m price data
^VIX: fixed 1555/3079 value=0 errors in 5m price data
^VIX: fixed 860/869 value=0 errors in 2m price data
^VIX: fixed 105/207 value=0 errors in 5m price data
^VIX: fixed 1029/1029 value=0 errors in 15m price data
^VIX: fixed 511/3271 value=0 errors in 30m price data
^VIX: fixed 580/580 valu

In [42]:
raw_data_df

{'SPY':     Adj Close     Close      High       Low      Open    Volume  VIX_Open   
 0    0.618780  0.652687  0.570340  0.572051  0.530056  0.467120  0.266484  \
 1    0.627982  0.662392  0.684713  0.727047  0.688185  0.373996  0.201465   
 2    0.424241  0.447487  0.578727  0.528478  0.669530  0.564902  0.163919   
 3    0.445602  0.470018  0.389630  0.496110  0.490969  0.384482  0.261905   
 4    0.203741  0.214905  0.457111  0.307501  0.530945  0.584440  0.227106   
 5    0.018075  0.019065  0.139917  0.114224  0.271839  1.000000  0.594322   
 6    0.000000  0.000000  0.034313  0.000000  0.000000  0.830874  0.659341   
 7    0.209328  0.220798  0.150973  0.199191  0.257329  0.787668  0.915751   
 8    0.128817  0.135876  0.000000  0.095238  0.120818  0.912617  0.582417   
 9    0.353260  0.372617  0.266108  0.175537  0.148357  0.752738  0.855311   
 10   0.201060  0.160485  0.187190  0.245876  0.337874  0.738220  0.555861   
 11   0.324760  0.290468  0.178422  0.293184  0.266212  0

In [46]:
len_seq = 5


def split_sequence(df, n_steps):
    Input , Label = list(), list()
    for i in range(len(df)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(df)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = df.iloc[i:end_ix,:], df.iloc[end_ix:end_ix+1,:][["High", "Low"]]
#         seq_x, seq_y = df[i:end_ix,:], df[end_ix:end_ix+1,:][["High", "Low"]]

        Input.append(seq_x)
        Label.append(seq_y)
    return Input, Label

def data_parsing (df_dict, n_steps):
    Input_formated_data, Label_formated_data = list(), list()
    for value_df in df_dict.values():
        Input, Label = split_sequence(value_df, n_steps) 
        Input_formated_data=Input_formated_data +Input
        Label_formated_data=Label_formated_data+ Label
    return(Input_formated_data, Label_formated_data)

X, Y =  data_parsing(raw_data_df, len_seq)
print(len(X))
print(X[0].to_numpy(), Y[0].to_numpy())


66
[[0.61878037 0.65268676 0.5703399  0.57205111 0.53005625 0.46711987
  0.2664835  0.17965852 0.25415068 0.20178931 0.51011398 0.53836724
  0.42206474 0.44005309 0.36089131 0.30231529 0.39482566 0.24757286
  0.37002835 0.26566837 0.40272559 0.42675553 0.3709408  0.3938786
  0.35852698 0.33111267 0.40597127 0.27690998 0.43233995 0.32198854
  0.35449291 0.37698997 0.33811993 0.41368531 0.38141675 0.27559947
  0.45804706 0.31404668 0.5001533  0.39710577]
 [0.62798241 0.66239209 0.68471271 0.72704676 0.68818471 0.37399579
  0.2014651  0.13734228 0.29629626 0.21371782 0.62066956 0.65504561
  0.60509885 0.63481752 0.58321335 0.2522337  0.26209222 0.15736252
  0.29474429 0.21131454 0.51828229 0.54920718 0.52155689 0.51949904
  0.4752904  0.24997178 0.35401314 0.21641582 0.34782605 0.24665395
  0.40544808 0.43117867 0.3937508  0.46768315 0.42182862 0.26025987
  0.38591751 0.26366391 0.43299597 0.33971062]
 [0.42424099 0.44748703 0.57872729 0.52847832 0.66952986 0.56490173
  0.16391932 0.17817

In [48]:
class MyDataset(torch.utils.data.Dataset):
  def __init__(self , studied_val= ["SPY", "AAPL"]): # stage = phase de l'entraînement
    super().__init__()
    
    def Vix_treatment(data_indicator,studied_val):
        data_indicator = data_indicator.drop(columns = ['Adj Close','Volume'])

        new_col_name = {}
        for col in data_indicator.columns :
                    new_col_name[col] =   'VIX'+ '_' + col
        data_indicator.rename(columns=new_col_name,inplace=True)
        return data_indicator

    def avg_compute (df, mean_period, dropna = True):
        origin_df = df
        for periode in mean_period :

            reduced_mean = origin_df.rolling(periode).mean()
            new_col_name = {}

            for col in reduced_mean.columns :
                new_col_name[col] = 'avg_'+ col + '_' + str(periode)

            reduced_mean.rename(columns=new_col_name,inplace=True)

            df= pd.concat([df, reduced_mean], axis =1)
        if dropna :
            return(df.dropna())

        return(df)

    def value_split (Values, Mixed_df, Other_data, Periodes):
        Value_df_list = {}
        scaler = MinMaxScaler()
        for val in Values :

            temp_df = Mixed_df.xs(val, axis=1, level=1, drop_level=True)
            temp_df.drop(columns = 'Adj Close')
            temp_df = pd.concat([temp_df, Other_data], axis =1)
            temp_df = avg_compute(temp_df,Periodes)
            scaled_features_df = scaler.fit_transform(temp_df)
            temp_df = pd.DataFrame(scaled_features_df,columns=temp_df.columns)
    #         temp_df = pd.DataFrame(scaled_features_df, index = temp_df.index, columns=temp_df.columns)

            Value_df_list[val] = temp_df
        return Value_df_list



    def data_fetch (Periode, Interval, Periodes, Values = studied_val ):
        tickers_names = ""
        for val in Values :
            tickers_names += " "+val 

        data = yf.download(tickers = tickers_names,  # list of tickers
                    period =Periode,         # time period
                    interval = Interval,       # trading interval
                    prepost = False,       # download pre/post market hours data?
                    repair = True)  
        data_indicator = yf.download(tickers = "^VIX",  # list of tickers
                    period = Periode,         # time period
                    interval = Interval,       # trading interval
                    prepost = False,       # download pre/post market hours data?
                    repair = True) 

        data_indicator = Vix_treatment(data_indicator)
        extended_data = value_split(Values,data,data_indicator,Periodes)


        return(extended_data)
    
    raw_data_df = data_fetch (Periode="2mo", Interval="1d", Periodes = [2,3,5]) 
    
    len_seq = 5


    def split_sequence(df, n_steps):
        Input , Label = list(), list()
        for i in range(len(df)):
            # find the end of this pattern
            end_ix = i + n_steps
            # check if we are beyond the sequence
            if end_ix > len(df)-1:
                break
            # gather input and output parts of the pattern
            seq_x, seq_y = df.iloc[i:end_ix,:], df.iloc[end_ix:end_ix+1,:][["High", "Low"]]
    #         seq_x, seq_y = df[i:end_ix,:], df[end_ix:end_ix+1,:][["High", "Low"]]

            Input.append(seq_x)
            Label.append(seq_y)
        return Input, Label

    def data_parsing (df_dict, n_steps):
        Input_formated_data, Label_formated_data = list(), list()
        for value_df in df_dict.values():
            Input, Label = split_sequence(value_df, n_steps) 
            Input_formated_data=Input_formated_data +Input
            Label_formated_data=Label_formated_data+ Label
        return(Input_formated_data, Label_formated_data)

    self.Inputs, self.Labels =  data_parsing(raw_data_df, len_seq)


  def __len__(self):
    return (len(self.Inputs))
  
  def __getitem__(self,idx):
    return self.Inputs[idx].to_numpy(),self.Labels[idx].to_numpy()


In [49]:
dataset_training = MyDataset()

[*********************100%***********************]  3 of 3 completed

3 Failed downloads:
- ADJ: Period '2mo' is invalid, must be one of ['1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', 'ytd', 'max']
- CLOSE: No data found, symbol may be delisted
- VOLUME: No data found, symbol may be delisted
^VIX: fixed 344/5042 value=0 errors in 2m price data
^VIX: fixed 687/1045 value=0 errors in 2m price data
^VIX: fixed 1017/3305 value=0 errors in 5m price data
^VIX: fixed 1032/1752 value=0 errors in 2m price data
^VIX: fixed 291/575 value=0 errors in 5m price data
^VIX: fixed 1152/1152 value=0 errors in 15m price data
^VIX: fixed 1032/1752 value=0 errors in 2m price data
^VIX: fixed 364/719 value=0 errors in 5m price data
^VIX: fixed 144/144 value=0 errors in 15m price data
^VIX: fixed 575/3611 value=0 errors in 30m price data
^VIX: fixed 6691/7465 value=0 errors in 2m price data
^VIX: fixed 513/521 value=0 errors in 2m price data
^VIX: fixed 1555/3080 value=0 errors in 5m price data
^VIX: fi

TypeError: MyDataset.__init__.<locals>.Vix_treatment() missing 1 required positional argument: 'studied_val'