In [58]:
import numpy as np
import bdsm
import statsmodels.formula.api as smf
from patsy.builtins import Q
import patsy

from collections.abc import Iterable

In [10]:
# Load the abalone dataset object from the `bdsm` module.
# NOTE(review): `bdsm` is not defined in this notebook; its API is assumed from usage below.
abalones_ds = bdsm.abalones()

In [14]:
# Build the regression frame for sex == 'M'.
# NOTE(review): presumably this filters to male abalones and drops the Sex
# column (the displayed frame has no Sex column) — confirm against `bdsm`.
df_abalones_sex = abalones_ds.for_regression(sex = 'M')
# Bare last expression: rich display of the (truncated) frame.
df_abalones_sex

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
3,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
8,0.475,0.370,0.125,0.5095,0.2165,0.1125,0.1650,9
11,0.430,0.350,0.110,0.4060,0.1675,0.0810,0.1350,10
...,...,...,...,...,...,...,...,...
4170,0.550,0.430,0.130,0.8395,0.3155,0.1955,0.2405,10
4171,0.560,0.430,0.155,0.8675,0.4000,0.1720,0.2290,8
4173,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9


In [45]:
def gen_lm_string(dependent: Iterable[str], dimensions: Iterable[str]) -> str:
    """Build a patsy formula string of the form ``lhs ~ rhs``.

    Every column name is wrapped in patsy's ``Q(...)`` so that names which are
    not valid Python identifiers (e.g. ``"Whole weight"``) can be used.

    Parameters
    ----------
    dependent
        Name, or iterable of names, of the response column(s). A bare string
        is treated as a single column name.
    dimensions
        Name, or iterable of names, of the predictor columns. Any name that is
        also listed in ``dependent`` is dropped from the right-hand side.

    Returns
    -------
    str
        The formula, e.g. ``"Q('Rings') ~ Q('Height') + Q('Diameter')"``.
        When no predictor remains, the right-hand side is ``1``
        (intercept-only model) so the formula stays valid.
    """
    # A bare string is one column name, not an iterable of characters.
    # Other iterables are materialised so one-shot iterators (generators)
    # survive being traversed more than once below.
    dependent = [dependent] if isinstance(dependent, str) else list(dependent)
    dimensions = [dimensions] if isinstance(dimensions, str) else list(dimensions)

    def quote(name) -> str:
        # repr() picks a quoting style that is safe even when the column name
        # itself contains a quote character.
        return f"Q({str(name)!r})"

    dep_str = ' + '.join(quote(it) for it in dependent)
    dim_str = ' + '.join(quote(it) for it in dimensions if it not in dependent)
    return f"{dep_str} ~ {dim_str or '1'}"

In [50]:
# Baseline model: regress Rings on every column except the response itself
# and Length.
# NOTE(review): the notebook does not say why Length is left out — confirm.
exclude = ["Rings", "Length"]
columns = [name for name in df_abalones_sex.columns if name not in exclude]

baseline_formula = gen_lm_string("Rings", columns)
res = smf.ols(baseline_formula, data=df_abalones_sex).fit()
res.summary()

0,1,2,3
Dep. Variable:,Q('Rings'),R-squared:,0.44
Model:,OLS,Adj. R-squared:,0.437
Method:,Least Squares,F-statistic:,198.9
Date:,"Tue, 09 Nov 2021",Prob (F-statistic):,2.87e-187
Time:,10:30:33,Log-Likelihood:,-3417.2
No. Observations:,1528,AIC:,6848.0
Df Residuals:,1521,BIC:,6886.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.1616,0.519,9.954,0.000,4.144,6.179
Q('Diameter'),4.6011,1.810,2.542,0.011,1.050,8.152
Q('Height'),14.9013,3.411,4.369,0.000,8.211,21.592
Q('Whole weight'),8.6668,1.136,7.628,0.000,6.438,10.895
Q('Shucked weight'),-18.8064,1.256,-14.978,0.000,-21.269,-16.343
Q('Viscera weight'),-10.1989,1.988,-5.131,0.000,-14.098,-6.300
Q('Shell weight'),10.6912,1.778,6.012,0.000,7.203,14.179

0,1,2,3
Omnibus:,257.136,Durbin-Watson:,1.476
Prob(Omnibus):,0.0,Jarque-Bera (JB):,544.49
Skew:,0.972,Prob(JB):,5.83e-119
Kurtosis:,5.184,Cond. No.,99.0


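## Forward Selection

Starting from an empty model, repeatedly add the predictor whose coefficient is significant and whose model has the lowest residual sum of squares (RSS); stop when no remaining predictor is significant.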

In [186]:
def feature_selection(data, label, all_columns, pvalue_threshold=0.05):
    """Greedy forward selection of OLS predictors for ``label``.

    Starting with no predictors, each round fits one OLS model per
    not-yet-selected column (the already selected columns plus that single
    candidate). A candidate is kept only if the p-value of the last
    coefficient in the fit (the candidate's own term) is at most
    ``pvalue_threshold``. Of the surviving candidates, the one whose model has
    the smallest residual sum of squares is added. The loop ends when no
    candidate survives.

    Parameters
    ----------
    data
        Data passed to ``smf.ols`` (a DataFrame in this notebook).
    label
        Name of the response column (or an iterable of names, which are all
        excluded from the candidate predictors).
    all_columns : Iterable[str]
        Column names to consider; the response is filtered out.
    pvalue_threshold : float, default 0.05
        Largest p-value at which a candidate's coefficient still counts as
        significant.

    Returns
    -------
    list
        Selected column names, in the order they were added.
    """
    # Compare names for equality: ``col not in "Rings"`` would be a substring
    # test and silently drop a column such as "Ring".
    labels = [label] if isinstance(label, str) else list(label)
    candidates = [col for col in all_columns if col not in labels]
    selected = []

    while True:
        best_column = None
        best_rss = None
        for column in candidates:
            if column in selected:
                continue
            formula = gen_lm_string(label, selected + [column])
            res = smf.ols(formula, data=data).fit()
            # The candidate is the last term of the formula, so its p-value is
            # the last entry; use positional access explicitly.
            # NOTE(review): for a categorical candidate this is only the last
            # level's p-value — confirm that is acceptable.
            pvalue = res.pvalues.iloc[-1]
            # Written positively so a NaN p-value is rejected, not accepted.
            if not pvalue <= pvalue_threshold:
                continue
            # Residual sum of squares as reported by the fit; this stays
            # correct even if rows were dropped for missing values.
            rss = res.ssr
            # ``<=`` keeps the later candidate on an exact tie.
            if best_rss is None or rss <= best_rss:
                best_column, best_rss = column, rss
        if best_column is None:
            return selected
        selected.append(best_column)

# Run forward selection for Rings over every column of the regression frame;
# the bare name on the last line displays the selected columns in the order
# they were added.
features = feature_selection(
    data=df_abalones_sex,
    label="Rings",
    all_columns=df_abalones_sex.columns,
)
features

['Shell weight',
 'Shucked weight',
 'Whole weight',
 'Height',
 'Viscera weight',
 'Diameter']

In [185]:
# Refit OLS on the columns chosen by forward selection and show the summary.
selected_formula = gen_lm_string(["Rings"], features)
res = smf.ols(selected_formula, data=df_abalones_sex).fit()
res.summary()


0,1,2,3
Dep. Variable:,Q('Rings'),R-squared:,0.44
Model:,OLS,Adj. R-squared:,0.437
Method:,Least Squares,F-statistic:,198.9
Date:,"Tue, 09 Nov 2021",Prob (F-statistic):,2.87e-187
Time:,11:52:02,Log-Likelihood:,-3417.2
No. Observations:,1528,AIC:,6848.0
Df Residuals:,1521,BIC:,6886.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.1616,0.519,9.954,0.000,4.144,6.179
Q('Shell weight'),10.6912,1.778,6.012,0.000,7.203,14.179
Q('Shucked weight'),-18.8064,1.256,-14.978,0.000,-21.269,-16.343
Q('Whole weight'),8.6668,1.136,7.628,0.000,6.438,10.895
Q('Height'),14.9013,3.411,4.369,0.000,8.211,21.592
Q('Viscera weight'),-10.1989,1.988,-5.131,0.000,-14.098,-6.300
Q('Diameter'),4.6011,1.810,2.542,0.011,1.050,8.152

0,1,2,3
Omnibus:,257.136,Durbin-Watson:,1.476
Prob(Omnibus):,0.0,Jarque-Bera (JB):,544.49
Skew:,0.972,Prob(JB):,5.83e-119
Kurtosis:,5.184,Cond. No.,99.0
