Forward Method

In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm


# Toy dataset: ten observations of five numeric columns.
data = {
    "col_1": [44, 56, 70, 50, 44, 75, 35, 48, 33, 35],
    "col_2": [1.3, 2, 1.7, 1.5, 1.6, 1.2, 1.6, 1.4, 1, 1.1],
    "col_3": [9, 7, 5, 14, 15, 12, 6, 10, 15, 21],
    "col_4": [10, 6, 5, 12, 10, 15, 5, 12, 17, 20],
    "col_5": [1.5, 1, 1.7, 1.8, 2, 1, 1.9, 1.2, 0.8, 0.9],
}
df = pd.DataFrame(data)

# col_4 is the response; every other column is a candidate predictor.
y = df["col_4"]
X = df.drop(columns="col_4")

def forward_subset_selection(X, y, significance_level=0.05):
    """Greedy forward selection of predictors driven by OLS p-values.

    Starting from no predictors, each round fits one OLS model (with an
    added constant) per remaining column of ``X`` on top of the columns
    already chosen, and reads the p-value of the newly added column. The
    column with the smallest p-value is kept if that p-value is at most
    ``significance_level``; otherwise the search stops.

    Parameters
    ----------
    X : column-indexable table (the notebook passes a pandas DataFrame)
        Candidate predictors; ``X.columns`` supplies the feature names.
    y : response, passed unchanged to ``sm.OLS``.
    significance_level : float, default 0.05
        Largest p-value at which a candidate is still admitted.

    Returns
    -------
    list
        Chosen column names, in the order they entered the model.
    """
    candidates = list(X.columns)
    chosen = []

    def entry_p_value(name):
        # p-value of `name` when it joins the columns chosen so far.
        design = sm.add_constant(X[chosen + [name]])
        return sm.OLS(y, design).fit().pvalues[name]

    while candidates:
        winner, winner_p = None, float("inf")
        for name in candidates:
            trial_p = entry_p_value(name)
            # Strict comparison: on ties the earlier column wins, and a NaN
            # p-value can never become the winner.
            if trial_p < winner_p:
                winner, winner_p = name, trial_p

        # No candidate clears the threshold: the search is finished.
        if not (winner_p <= significance_level):
            return chosen

        chosen.append(winner)
        candidates.remove(winner)

    return chosen

# Forward selection over all candidate predictors at the default 5% level.
selected_features = forward_subset_selection(X, y, significance_level=0.05)
print(selected_features)

['col_3', 'col_2', 'col_5']


In [2]:
# Refit OLS (with intercept) on the predictors kept by forward selection.
model = sm.OLS(endog=y, exog=sm.add_constant(X[selected_features]))
model.fit().summary()



0,1,2,3
Dep. Variable:,col_4,R-squared:,0.985
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,130.3
Date:,"Mon, 28 Aug 2023",Prob (F-statistic):,7.51e-06
Time:,20:48:30,Log-Likelihood:,-8.9843
No. Observations:,10,AIC:,25.97
Df Residuals:,6,BIC:,27.18
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,17.8719,2.233,8.003,0.000,12.407,23.336
col_3,0.5711,0.068,8.385,0.000,0.404,0.738
col_2,-6.3334,1.193,-5.308,0.002,-9.253,-3.414
col_5,-2.9436,0.642,-4.585,0.004,-4.515,-1.373

0,1,2,3
Omnibus:,2.243,Durbin-Watson:,3.06
Prob(Omnibus):,0.326,Jarque-Bera (JB):,1.19
Skew:,0.543,Prob(JB):,0.552
Kurtosis:,1.705,Cond. No.,127.0


In [3]:
# Reference fit: OLS (with intercept) on every candidate predictor.
model = sm.OLS(endog=y, exog=sm.add_constant(X))
model.fit().summary()



0,1,2,3
Dep. Variable:,col_4,R-squared:,0.992
Model:,OLS,Adj. R-squared:,0.985
Method:,Least Squares,F-statistic:,147.1
Date:,"Mon, 28 Aug 2023",Prob (F-statistic):,2.27e-05
Time:,20:48:31,Log-Likelihood:,-6.0619
No. Observations:,10,AIC:,22.12
Df Residuals:,5,BIC:,23.64
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,15.9796,2.059,7.762,0.001,10.688,21.271
col_1,0.0326,0.016,1.993,0.103,-0.009,0.075
col_2,-6.6182,0.986,-6.710,0.001,-9.153,-4.083
col_3,0.6059,0.058,10.380,0.000,0.456,0.756
col_5,-2.7211,0.537,-5.069,0.004,-4.101,-1.341

0,1,2,3
Omnibus:,4.111,Durbin-Watson:,2.851
Prob(Omnibus):,0.128,Jarque-Bera (JB):,2.015
Skew:,1.098,Prob(JB):,0.365
Kurtosis:,2.89,Cond. No.,570.0


Backward Method

In [4]:
def backward_subset_selection(X, y, significance_level=0.05):
    """Backward elimination of predictors driven by OLS p-values.

    Starts from all columns of ``X``, fits OLS (with a constant added via
    ``sm.add_constant``) and repeatedly drops the column with the largest
    p-value while that p-value exceeds ``significance_level``.

    Parameters
    ----------
    X : column-indexable table (the notebook passes a pandas DataFrame)
        Candidate predictors; ``X.columns`` supplies the feature names.
    y : response, passed unchanged to ``sm.OLS``.
    significance_level : float, default 0.05
        A column is removed while the largest p-value is above this value.

    Returns
    -------
    list
        Surviving column names, in their original order. Empty when every
        column is eliminated (or ``X`` has no columns).
    """
    selected_columns = list(X.columns)

    # Stop as soon as nothing is left to test; no intercept-only fit needed.
    while selected_columns:
        model = sm.OLS(y, sm.add_constant(X[selected_columns])).fit()

        # Pick the feature p-values by label instead of slicing off position 0:
        # add_constant does not prepend a column when the data already holds a
        # constant, and a positional slice would then hide a real feature.
        p_values = model.pvalues[selected_columns]

        max_p_value = p_values.max()
        # Written as a negated ">" so a NaN maximum ends the loop.
        if not (max_p_value > significance_level):
            break

        selected_columns.remove(p_values.idxmax())

    return selected_columns

# Backward elimination from the full predictor set at the default 5% level.
selected_features = backward_subset_selection(X, y, significance_level=0.05)
print(selected_features)

['col_2', 'col_3', 'col_5']


In [5]:
# Refit OLS (with intercept) on the predictors that survived backward elimination.
model = sm.OLS(endog=y, exog=sm.add_constant(X[selected_features]))
model.fit().summary()



0,1,2,3
Dep. Variable:,col_4,R-squared:,0.985
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,130.3
Date:,"Mon, 28 Aug 2023",Prob (F-statistic):,7.51e-06
Time:,20:48:34,Log-Likelihood:,-8.9843
No. Observations:,10,AIC:,25.97
Df Residuals:,6,BIC:,27.18
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,17.8719,2.233,8.003,0.000,12.407,23.336
col_2,-6.3334,1.193,-5.308,0.002,-9.253,-3.414
col_3,0.5711,0.068,8.385,0.000,0.404,0.738
col_5,-2.9436,0.642,-4.585,0.004,-4.515,-1.373

0,1,2,3
Omnibus:,2.243,Durbin-Watson:,3.06
Prob(Omnibus):,0.326,Jarque-Bera (JB):,1.19
Skew:,0.543,Prob(JB):,0.552
Kurtosis:,1.705,Cond. No.,127.0


Stepwise Method

In [8]:
def stepwise_subset_selection(X, y, significance_level_entry=0.05, significance_level_exit=0.1):
    """Bidirectional (stepwise) selection of predictors driven by OLS p-values.

    Each round has two phases:

    1. Forward step: every remaining column is tried on top of the columns
       already selected; the one with the smallest p-value is added if that
       p-value is at most ``significance_level_entry``.
    2. Backward step: the model on the selected columns is refit and the
       column with the largest p-value is dropped while that p-value exceeds
       ``significance_level_exit``. Dropped columns become candidates again.

    The search ends when a round changes nothing, or when it reaches a set
    of columns it has already visited (which would mean endless add/remove
    cycling, e.g. when the entry level is looser than the exit level).

    Parameters
    ----------
    X : column-indexable table (the notebook passes a pandas DataFrame)
        Candidate predictors; ``X.columns`` supplies the feature names.
    y : response, passed unchanged to ``sm.OLS``.
    significance_level_entry : float, default 0.05
        Largest p-value at which a candidate is admitted.
    significance_level_exit : float, default 0.1
        A selected column is removed once its p-value rises above this value.

    Returns
    -------
    list
        Selected column names, in the order they entered the model.
    """
    remaining_features = list(X.columns)
    selected_features = []
    # Feature sets reached so far; used to detect add/remove cycles.
    visited = {frozenset()}

    while True:
        changed = False

        # --- Forward step: find the most significant candidate. ---
        best_p_value = float("inf")
        best_feature = None
        for feature in remaining_features:
            model = sm.OLS(y, sm.add_constant(X[selected_features + [feature]])).fit()
            p_value = model.pvalues[feature]
            if p_value < best_p_value:
                best_p_value = p_value
                best_feature = feature

        if best_feature is not None and best_p_value <= significance_level_entry:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            changed = True

        # --- Backward step: drop columns that are no longer significant. ---
        while selected_features:
            model = sm.OLS(y, sm.add_constant(X[selected_features])).fit()
            # Select by label so the intercept's p-value is never considered.
            p_values = model.pvalues[selected_features]
            worst_p_value = p_values.max()
            # Negated ">" so a NaN maximum leaves the current set untouched.
            if not (worst_p_value > significance_level_exit):
                break
            worst_feature = p_values.idxmax()
            selected_features.remove(worst_feature)
            remaining_features.append(worst_feature)
            changed = True

        state = frozenset(selected_features)
        if not changed or state in visited:
            break
        visited.add(state)

    return selected_features

# Stepwise selection with the default entry (5%) and exit (10%) levels.
selected_features = stepwise_subset_selection(
    X, y, significance_level_entry=0.05, significance_level_exit=0.1
)
print(selected_features)

['col_3', 'col_2', 'col_5']


In [7]:
# Refit OLS (with intercept) on the predictors chosen by stepwise selection.
model = sm.OLS(endog=y, exog=sm.add_constant(X[selected_features]))
model.fit().summary()



0,1,2,3
Dep. Variable:,col_4,R-squared:,0.985
Model:,OLS,Adj. R-squared:,0.977
Method:,Least Squares,F-statistic:,130.3
Date:,"Mon, 28 Aug 2023",Prob (F-statistic):,7.51e-06
Time:,20:48:35,Log-Likelihood:,-8.9843
No. Observations:,10,AIC:,25.97
Df Residuals:,6,BIC:,27.18
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,17.8719,2.233,8.003,0.000,12.407,23.336
col_3,0.5711,0.068,8.385,0.000,0.404,0.738
col_2,-6.3334,1.193,-5.308,0.002,-9.253,-3.414
col_5,-2.9436,0.642,-4.585,0.004,-4.515,-1.373

0,1,2,3
Omnibus:,2.243,Durbin-Watson:,3.06
Prob(Omnibus):,0.326,Jarque-Bera (JB):,1.19
Skew:,0.543,Prob(JB):,0.552
Kurtosis:,1.705,Cond. No.,127.0
