In [None]:
def create_lag_features(df, feature_name, num_lags = 1, group_cols = ('item_dept', 'store')):
    '''
    Create lag features for the specified field, shifting within each group.
    Inputs:
        df: dataframe (sorted by date). The new columns are added to this dataframe in place.
        feature_name: name of the feature to create the lag feature
        num_lags: number of lag features to create (default is 1). Lags 1..num_lags are created,
                  one column each, named lag_<feature_name>_<k>
        group_cols: column name, or sequence of column names, identifying each independent series
                    (default is ('item_dept', 'store'))

    Output:
        the same dataframe with the created lag features
    '''
    # Accept a single column name as well as a sequence; groupby needs a list of keys.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # The grouping does not depend on the lag, so build it once and reuse it.
    grouped = df.groupby(keys)[feature_name]
    for lag in range(1, num_lags + 1):
        # shift() moves values down inside each group, so the first `lag` rows of a group are NaN.
        df[f'lag_{feature_name}_{lag}'] = grouped.shift(lag)

    return df


In [None]:
def create_rolling_window_features(df, feature_name, window_size = 2, std_dev = True, use_lag = True, group_cols = ('item_dept', 'store')):
    '''
    Create rolling window features (mean and, if required, standard deviation) of a feature
    for a specified window size, computed within each group.
    Inputs:
        df: dataframe (sorted by date). Must already contain the column lag_<feature_name>_1.
            The new columns are added to this dataframe in place.
        feature_name: name of the feature to create the rolling window features
        window_size: the length of the previous time steps to consider to create the feature
        std_dev: whether to create the standard deviation feature as well (default is True)
        use_lag: whether to use lag feature to create the feature (default is True). Uses just one lag
        group_cols: column name, or sequence of column names, identifying each independent series
                    (default is ('item_dept', 'store'))

    return:
        dataframe with the initial features and the created rolling window features
        (columns rolling_mean_<feature_name>_<window_size> and, optionally,
        rolling_std_<feature_name>_<window_size>), or None when use_lag is False
    '''
    if not use_lag:
        print("Function not yet designed to use without lag features")
        return None

    # Accept a single column name as well as a sequence; groupby needs a list of keys.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)

    # Rolling statistics are taken over the 1-step lag, not over the raw feature.
    grouped = df.groupby(keys)[f'lag_{feature_name}_1']

    # transform() keeps the result aligned with df's original row index.
    df[f'rolling_mean_{feature_name}_{window_size}'] = grouped.transform(
        lambda series: series.rolling(window=window_size).mean()
    )

    if std_dev:
        df[f'rolling_std_{feature_name}_{window_size}'] = grouped.transform(
            lambda series: series.rolling(window=window_size).std()
        )

    return df



In [None]:
def create_cumulative_features(df, feature_name, cum_mean = True, cum_sum = True, use_lag = True, group_cols = ('item_dept', 'store')):
    '''
    Create cumulative features for the specified field, accumulated within each group.
    Inputs:
        df: dataframe (sorted by date). Must already contain the column lag_<feature_name>_1.
            The new columns are added to this dataframe in place.
        feature_name: name of the feature to create the cumulative feature
        cum_mean: whether the cumulative mean is required (column cummean_<feature_name>)
        cum_sum: whether the cumulative sum is required (column cumsum_<feature_name>)
        use_lag: whether to use lag feature to create the feature (default is True). Uses just one lag
        group_cols: column name, or sequence of column names, identifying each independent series
                    (default is ('item_dept', 'store'))

    Output:
        dataframe with the created cumulative features, or None when use_lag is False
        or when neither cum_sum nor cum_mean is requested
    '''
    if not use_lag:
        print("Function not yet designed to use without lag features")
        return None

    # Validate up front, before touching the dataframe.
    if not cum_mean and not cum_sum:
        print("At least one parameter (cum_sum or cum_mean) should be True")
        return None

    # Accept a single column name as well as a sequence; groupby needs a list of keys.
    keys = [group_cols] if isinstance(group_cols, str) else list(group_cols)
    grouped = df.groupby(keys)[f'lag_{feature_name}_1']

    if cum_sum:
        df[f'cumsum_{feature_name}'] = grouped.cumsum()

    if cum_mean:
        # transform() returns values on df's original index. Calling
        # groupby(...).expanding().mean().reset_index(drop=True) instead yields the
        # values in group-sorted order relabelled 0..n-1, which attaches them to the
        # wrong rows whenever the groups are interleaved (e.g. a date-sorted frame).
        df[f'cummean_{feature_name}'] = grouped.transform(
            lambda series: series.expanding().mean()
        )

    return df


In [None]:
def create_time_based_features(df, date_col):
    '''
    Function to create time based features
    Inputs:
        df: dataframe with date column. The new columns are added to this dataframe in place.
        date_col: specify the date column (make sure it is in datetime format)

    Output:
        dataframe with the created time based features:
            day_of_week: integer day of the week (0=Monday, 6=Sunday)
            isWeekend: boolean, True when day_of_week is 5 or 6
            Is_Holiday: 1 when the date is found in the US holiday calendar, else 0
    '''
    # Imported inside the function so the third-party `holidays` package is only
    # required when this feature builder is actually called. It must come after the
    # docstring, otherwise the string above is not registered as the function's __doc__.
    import holidays

    # Day of the week (0=Monday, 6=Sunday)
    weekday = df[date_col].dt.dayofweek
    df['day_of_week'] = weekday

    # Is weekend (True=Saturday/Sunday, False=weekday); stored as a boolean column
    df['isWeekend'] = weekday >= 5

    # Is holiday (1=Holiday, 0=No Holiday). Assuming this store is in US
    us_holidays = holidays.US(years=[2021, 2022])
    df['Is_Holiday'] = df[date_col].apply(lambda x: x in us_holidays).astype(int)

    return df

In [3]:
# Scratch cell: prints i+1 for each i in range(3), i.e. the 1-based values 1, 2, 3.
for i in range(3):
    print(i+1)

1
2
3


In [4]:
# Scratch cell: concatenating the three string pieces evaluates to 'use_lag'.
'use'+'_'+'lag'

'use_lag'