In [1]:
def feature_selection_vif(df, target, vif_tol=4):

    ''' Conduct the automated feature selection using the Variance Inflation Factor values.

    Input:
    df (DataFrame): Dataframe with all features to be considered in feature selection.
    target (string): Name of the target variable - we exclude it from feature selection process.
    vif_tol (numeric): The VIF tolerance for the feature selection process - all variables having VIF above the vif_tol are excluded. Default value 4 - as in the original work . 

    Output:
    List of features selected within the process.

    Notes:
    The function is based on the function provided in the StackOverflow thread.
    Exact source: https://stackoverflow.com/questions/42658379/variance-inflation-factor-in-python Answer provided by user steven, on Feb 24, 2019 at 23:06.

    Moreover, to access the predictor with the highest VIF from the dictionary, we used the code proposed here: https://note.nkmk.me/en/python-dict-get-key-from-value/
    '''
    
    max_vif = 1000000 # set initial maximum VIF to be certainly above the vif_tol
    predictors = df.columns[~df.columns.isin([target])].tolist() # get initial list of considered predictors - exclude the target variable
    
    while max_vif > vif_tol: # exclude the predictor with the highest VIF (max_vif) up to the point, where VIF for all predictors is below vif_tol
         
        vif_dict = {} # initialize dictionary with VIF values
        
        for predictor in predictors: # for each predictor calculate VIF by regressing a chosen predictor on the remaining ones
            
            not_predictor = [i for i in predictors if i != predictor]
            X, y = df[not_predictor], df[predictor]

            # extract r-squared from the fit
            r_squared = LinearRegression().fit(X, y).score(X, y)

            # calculate VIF
            vif = 1/(1 - r_squared)
            vif_dict[predictor] = vif
        
        max_vif = max(vif_dict.values()) # access the maximum VIF value
        predictor_exclude = [k for k, v in vif_dict.items() if v == max_vif][0] # access the name of predictor with max_vif. Source of the method applied here: https://note.nkmk.me/en/python-dict-get-key-from-value/
        
        print("Variable {} excluded. VIF: {}".format(predictor_exclude, round(max_vif,3)))
        
        # exclude the predictor with highest VIF
        predictors.remove(predictor_exclude) 
        
    # after the feature selection show the remaining features
    print("\n Feature selection done! Finally chosen predictors: \n")
    print(predictors)
    
    return predictors

IndentationError: expected an indented block (Temp/ipykernel_12420/1710631959.py, line 11)