Chauvris_Xavier_A1_Regression_Analysis

# Regression Analysis: Importation of the data

In [1]:
# importing the libraries
import pandas as pd #data science essential
import numpy as np
import seaborn as sns #enhanced graphical output
import matplotlib.pyplot as plt #essential graphical output
import statsmodels.formula.api as smf #regression modeling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split #train/test split
from sklearn.linear_model import Lasso #lasso model
from sklearn.neighbors import KNeighborsClassifier #K-nearest neighbor model
from sklearn.linear_model import ARDRegression

# specify file name (path is relative to the notebook's working directory)
file = './birthweight_low.xlsx'

# reading the Excel file into a pandas DataFrame
bwdf = pd.read_excel(io = file)


# outputting the first 5 rows of the dataset
bwdf.head(n = 5)

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,...,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33
0,69,,5,2.0,62,,4,7,23,9,...,,,,,,,,,,
1,68,12.0,3,10.0,61,11.0,4,6,25,11,...,,,,,,,,,,
2,71,12.0,3,6.0,46,12.0,2,7,21,12,...,,,,,,,,,,
3,59,16.0,1,8.0,48,16.0,7,8,21,10,...,,,,,,,,,,
4,48,12.0,4,6.0,39,12.0,2,9,17,13,...,,,,,,,,,,


## Removing Missing Values

In [2]:
# dropping the placeholder columns labelled 'Unnamed: N' (they contain only NaN values)
# selecting them by name prefix instead of a hard-coded list keeps this cell
# re-runnable: once the columns are gone the list is simply empty, whereas
# dropping explicit labels a second time raises a KeyError
unnamed_columns = [col for col in bwdf.columns if str(col).startswith('Unnamed: ')]
bwdf = bwdf.drop(columns = unnamed_columns)

# checking results to be sure the columns were deleted
bwdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mage    196 non-null    int64  
 1   meduc   193 non-null    float64
 2   monpre  196 non-null    int64  
 3   npvis   193 non-null    float64
 4   fage    196 non-null    int64  
 5   feduc   189 non-null    float64
 6   omaps   196 non-null    int64  
 7   fmaps   196 non-null    int64  
 8   cigs    196 non-null    int64  
 9   drink   196 non-null    int64  
 10  male    196 non-null    int64  
 11  mwhte   196 non-null    int64  
 12  mblck   196 non-null    int64  
 13  moth    196 non-null    int64  
 14  fwhte   196 non-null    int64  
 15  fblck   196 non-null    int64  
 16  foth    196 non-null    int64  
 17  bwght   196 non-null    int64  
dtypes: float64(3), int64(15)
memory usage: 27.7 KB


In [3]:
# flagging, column by column, whether at least one value is missing
bwdf.isna().any()

mage      False
meduc      True
monpre    False
npvis      True
fage      False
feduc      True
omaps     False
fmaps     False
cigs      False
drink     False
male      False
mwhte     False
mblck     False
moth      False
fwhte     False
fblck     False
foth      False
bwght     False
dtype: bool

In [4]:
# replacing missing values by the median of each column
# the result is assigned back to the DataFrame: calling fillna(inplace=True) on
# a column selection is chained assignment, which emits a FutureWarning on
# pandas 2.x and leaves bwdf unchanged when Copy-on-Write is enabled
columns_with_missing = ['meduc', 'npvis', 'feduc']
bwdf[columns_with_missing] = bwdf[columns_with_missing].fillna(
    bwdf[columns_with_missing].median())

In [5]:
# confirming that no missing value remains anywhere in the dataset
bwdf.isna().any().any()

False

# Running first regression analysis

In [6]:
# INSTANTIATING a model type
# baseline OLS: bwght regressed on all 17 original explanatory variables
# NOTE(review): the formula has 17 terms but the summary reports Df Model: 15,
# so two terms are linearly redundant. Presumably mwhte/mblck/moth and
# fwhte/fblck/foth each sum to 1 and duplicate the intercept (dummy-variable
# trap) - confirm, and consider dropping one indicator per group.
lm_fit = smf.ols(formula = """ bwght ~  mage+
 meduc+
 monpre+
 npvis+
 fage+
 feduc+
 omaps+
 fmaps+
 cigs+
 drink+
 male+
 mwhte+
 mblck+
 moth+
 fwhte+
 fblck+
 foth
 """,
                            data = bwdf)

# telling Python to FIT the data to the blueprint
results_1 = lm_fit.fit()


# printing a summary of the results
print(results_1.summary())

                            OLS Regression Results                            
Dep. Variable:                  bwght   R-squared:                       0.721
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     31.06
Date:                Sat, 25 Dec 2021   Prob (F-statistic):           4.90e-42
Time:                        12:16:34   Log-Likelihood:                -1420.9
No. Observations:                 196   AIC:                             2874.
Df Residuals:                     180   BIC:                             2926.
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2313.0728    296.759      7.794      0.0

## Removal of the statistically insignificant variables

In [7]:
# INSTANTIATING a model type
# reduced OLS: keeps mage, cigs, drink and the six race indicators
# NOTE(review): 9 terms are listed but the summary reports Df Model: 7, so the
# two indicator groups still appear to be redundant with the intercept.
lm_fit_2 = smf.ols(formula = """ bwght ~  mage+
 cigs+
 drink+
 mwhte+
 mblck+
 moth+
 fwhte+
 fblck+
 foth
 """,
                            data = bwdf)

# telling Python to FIT the data to the blueprint
results_2 = lm_fit_2.fit()


# printing a summary of the results
print(results_2.summary())

                            OLS Regression Results                            
Dep. Variable:                  bwght   R-squared:                       0.706
Model:                            OLS   Adj. R-squared:                  0.695
Method:                 Least Squares   F-statistic:                     64.57
Date:                Sat, 25 Dec 2021   Prob (F-statistic):           1.12e-46
Time:                        12:16:34   Log-Likelihood:                -1426.0
No. Observations:                 196   AIC:                             2868.
Df Residuals:                     188   BIC:                             2894.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   2970.7332     66.826     44.455      0.0

# Data Exploration and variables creation

In [8]:
# computing the skewness of every column, listed from most positively
# skewed to most negatively skewed
bwdf.skew().sort_values(ascending=False)

npvis     2.226209
monpre    2.031858
fage      1.120425
mwhte     1.041790
mage      0.922193
foth      0.821757
fblck     0.672052
fwhte     0.648090
moth      0.648090
mblck     0.486607
drink     0.234876
cigs      0.031881
meduc    -0.071834
male     -0.206738
bwght    -0.659832
feduc    -1.338531
fmaps    -2.551277
omaps    -2.641176
dtype: float64

In [9]:
# adding a natural-log version (log_<name>) of selected variables
# npvis, monpre, fmaps and omaps are the ones whose skewness exceeds 2 in
# absolute value; feduc and meduc are logged as well although their skewness
# is smaller
for column in ['npvis', 'monpre', 'fmaps', 'omaps', 'feduc', 'meduc']:
    bwdf[f'log_{column}'] = np.log(bwdf[column])

In [10]:
# summarising every numeric column (count, mean, std, min, quartiles, max),
# including the log-transformed columns added above
bwdf.describe()

Unnamed: 0,mage,meduc,monpre,npvis,fage,feduc,omaps,fmaps,cigs,drink,...,fwhte,fblck,foth,bwght,log_npvis,log_monpre,log_fmaps,log_omaps,log_feduc,log_meduc
count,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,...,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0,196.0
mean,40.153061,13.913265,2.341837,11.607143,39.290816,13.852041,8.193878,8.964286,10.928571,5.397959,...,0.346939,0.341837,0.311224,3334.086735,2.392537,0.718551,2.190158,2.07074,2.597524,2.62183
std,10.250055,2.040017,1.355136,4.234625,8.982725,2.586661,1.576482,0.651428,6.101282,3.001674,...,0.477215,0.47554,0.46418,646.700904,0.352281,0.503378,0.082413,0.299038,0.313256,0.150067
min,23.0,8.0,1.0,2.0,23.0,1.0,2.0,5.0,0.0,0.0,...,0.0,0.0,0.0,697.0,0.693147,0.0,1.609438,0.693147,0.0,2.079442
25%,33.0,12.0,2.0,10.0,34.75,12.0,8.0,9.0,6.0,4.0,...,0.0,0.0,0.0,2916.25,2.302585,0.693147,2.197225,2.079442,2.484907,2.484907
50%,39.0,14.0,2.0,12.0,38.0,14.0,9.0,9.0,11.0,5.0,...,0.0,0.0,0.0,3452.0,2.484907,0.693147,2.197225,2.197225,2.639057,2.639057
75%,46.0,16.0,3.0,12.0,43.0,16.0,9.0,9.0,15.25,7.25,...,1.0,1.0,1.0,3759.5,2.484907,1.098612,2.197225,2.197225,2.772589,2.772589
max,71.0,17.0,8.0,35.0,73.0,17.0,10.0,10.0,25.0,14.0,...,1.0,1.0,1.0,4933.0,3.555348,2.079442,2.302585,2.302585,2.833213,2.833213


In [11]:
# interaction feature: element-wise product of fage and mage
bwdf['fage_mage'] = bwdf['fage'].mul(bwdf['mage'])

# natural log of the interaction feature
bwdf['log_fage_mage'] = np.log(bwdf['fage_mage'])

In [12]:
# checking the skewness of the logged interaction term, to compare it with
# the skewness observed for fage and mage individually
bwdf['log_fage_mage'].skew()

0.6389637013639178

In [13]:
# building four mutually exclusive 0/1 flags from fage and mage
# H = age at or above the cutoff, L = age below the cutoff
# boolean masks are computed for the whole column at once instead of looping
# over rows with iterrows() and writing cell by cell through .loc; the
# resulting values are the same
age_cutoff = 40

fage_high = bwdf['fage'] >= age_cutoff
fage_low  = bwdf['fage'] <  age_cutoff
mage_high = bwdf['mage'] >= age_cutoff
mage_low  = bwdf['mage'] <  age_cutoff

# casting to int64 keeps the columns as integer 0/1 indicators
bwdf['fageH_mageL'] = (fage_high & mage_low).astype('int64')
bwdf['fageL_mageH'] = (fage_low & mage_high).astype('int64')
bwdf['fageH_mageH'] = (fage_high & mage_high).astype('int64')
bwdf['fageL_mageL'] = (fage_low & mage_low).astype('int64')

In [14]:
# building four mutually exclusive 0/1 flags from omaps and fmaps
# the cutoffs are intentionally asymmetric, exactly as in the row-wise version:
#   omaps is "high" (H) when >= 8 and "low" (L) when <  8
#   fmaps is "high" (H) when >  8 and "low" (L) when <= 8
# boolean masks are computed for the whole column at once instead of looping
# over rows with iterrows() and writing cell by cell through .loc
score_cutoff = 8

omaps_high = bwdf['omaps'] >= score_cutoff
omaps_low  = bwdf['omaps'] <  score_cutoff
fmaps_high = bwdf['fmaps'] >  score_cutoff
fmaps_low  = bwdf['fmaps'] <= score_cutoff

# casting to int64 keeps the columns as integer 0/1 indicators
bwdf['omapsH_fmapsL'] = (omaps_high & fmaps_low).astype('int64')
bwdf['omapsL_fmapsH'] = (omaps_low & fmaps_high).astype('int64')
bwdf['omapsH_fmapsH'] = (omaps_high & fmaps_high).astype('int64')
bwdf['omapsL_fmapsL'] = (omaps_low & fmaps_low).astype('int64')

# Running Final linear regression before model preparation

In [15]:
# INSTANTIATING a model type
# OLS using the logged fage*mage interaction and the four age-group flags in
# place of mage
# NOTE(review): the four fage/mage flags are mutually exclusive and cover every
# row, so they sum to 1 and are collinear with the intercept. The summary
# reports Df Model: 10 for 13 listed terms - consider dropping one flag (and
# one indicator from each race group, if those also sum to 1).
lm_fit_3 = smf.ols(formula = """ bwght ~  log_fage_mage+
 cigs+
 drink+
 mwhte+
 mblck+
 moth+
 fwhte+
 fblck+
 foth+
 fageH_mageL+
 fageL_mageH+
 fageH_mageH+
 fageL_mageL
 """,  data = bwdf)

# telling Python to FIT the data to the blueprint
results_3 = lm_fit_3.fit()


# printing a summary of the results
print(results_3.summary())

                            OLS Regression Results                            
Dep. Variable:                  bwght   R-squared:                       0.706
Model:                            OLS   Adj. R-squared:                  0.690
Method:                 Least Squares   F-statistic:                     44.49
Date:                Sat, 25 Dec 2021   Prob (F-statistic):           5.19e-44
Time:                        12:16:34   Log-Likelihood:                -1426.0
No. Observations:                 196   AIC:                             2874.
Df Residuals:                     185   BIC:                             2910.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      3247.7811    473.816      6.855

# Model Development Preparation

In [16]:
# displaying the column names so they can be copied into the next cell
bwdf.columns

Index(['mage', 'meduc', 'monpre', 'npvis', 'fage', 'feduc', 'omaps', 'fmaps',
       'cigs', 'drink', 'male', 'mwhte', 'mblck', 'moth', 'fwhte', 'fblck',
       'foth', 'bwght', 'log_npvis', 'log_monpre', 'log_fmaps', 'log_omaps',
       'log_feduc', 'log_meduc', 'fage_mage', 'log_fage_mage', 'fageH_mageL',
       'fageL_mageH', 'fageH_mageH', 'fageL_mageL', 'omapsH_fmapsL',
       'omapsL_fmapsH', 'omapsH_fmapsH', 'omapsL_fmapsL'],
      dtype='object')

In [17]:
# columns left out of the explanatory set: the raw variables replaced by
# engineered features, the unused log columns, and the response itself
excluded_columns = ['mage', 'meduc', 'monpre', 'npvis', 'fage', 'feduc',
                    'omaps', 'fmaps', 'male', 'bwght',
                    'log_npvis', 'log_monpre', 'log_fmaps', 'log_omaps',
                    'log_feduc', 'log_meduc', 'fage_mage']

# explanatory variables
bwdf_data = bwdf.drop(columns = excluded_columns)

# response variable
bwdf_target = bwdf['bwght']

# 75/25 train-test split with a fixed seed so the split is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    bwdf_data, bwdf_target, test_size = 0.25, random_state = 219)

# reporting the shape of each piece of the split
print(f"""
Training Data
-------------
X-side: {x_train.shape}
y-side: {y_train.shape}


Testing Data
------------
X-side: {x_test.shape}
y-side: {y_test.shape}
""")


Training Data
-------------
X-side: (147, 17)
y-side: (147,)


Testing Data
------------
X-side: (49, 17)
y-side: (49,)



# Final Regression Before Model Development

In [18]:
# displaying the explanatory column names so they can be copied into the next cell
bwdf_data.columns

Index(['cigs', 'drink', 'mwhte', 'mblck', 'moth', 'fwhte', 'fblck', 'foth',
       'log_fage_mage', 'fageH_mageL', 'fageL_mageH', 'fageH_mageH',
       'fageL_mageL', 'omapsH_fmapsL', 'omapsL_fmapsH', 'omapsH_fmapsH',
       'omapsL_fmapsL'],
      dtype='object')

In [19]:
# explanatory variables, grouped by kind
x_variables = [
    # consumption
    'cigs', 'drink',
    # race indicators
    'mwhte', 'mblck', 'moth', 'fwhte', 'fblck', 'foth',
    # logged fage*mage interaction
    'log_fage_mage',
    # fage/mage group flags
    'fageH_mageL', 'fageL_mageH', 'fageH_mageH', 'fageL_mageL',
    # omaps/fmaps group flags
    'omapsH_fmapsL', 'omapsL_fmapsH', 'omapsH_fmapsH', 'omapsL_fmapsL',
]


# printing one "<name> +" line per variable, ready to paste into a
# statsmodels formula
print("\n".join(f"{name} +" for name in x_variables))

cigs +
drink +
mwhte +
mblck +
moth +
fwhte +
fblck +
foth +
log_fage_mage +
fageH_mageL +
fageL_mageH +
fageH_mageH +
fageL_mageL +
omapsH_fmapsL +
omapsL_fmapsH +
omapsH_fmapsH +
omapsL_fmapsL +


In [20]:
# merging X_train and y_train so that they can be used in statsmodels
# (the formula API needs the response and the predictors in one DataFrame)
bwdf_train = pd.concat([x_train, y_train], axis = 1)


# Step 1: build a model on the training rows only, using every column of x_train
# NOTE(review): 17 terms are listed but the summary reports Df Model: 13. The
# fage/mage flags and the omaps/fmaps flags each sum to 1 by construction, and
# the race indicator groups presumably do too, so four terms are redundant
# with the intercept - consider dropping one column per group.
lm_best = smf.ols(formula =  """bwght ~ cigs +
drink +
mwhte +
mblck +
moth +
fwhte +
fblck +
foth +
log_fage_mage +
fageH_mageL +
fageL_mageH +
fageH_mageH +
fageL_mageL +
omapsH_fmapsL +
omapsL_fmapsH +
omapsH_fmapsH +
omapsL_fmapsL""",                   data = bwdf_train)


# Step 2: fit the model based on the data
results = lm_best.fit()



# Step 3: analyze the summary output
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  bwght   R-squared:                       0.720
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     26.31
Date:                Sat, 25 Dec 2021   Prob (F-statistic):           1.39e-30
Time:                        12:16:35   Log-Likelihood:                -1071.1
No. Observations:                 147   AIC:                             2170.
Df Residuals:                     133   BIC:                             2212.
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept      3213.1542    515.283      6.236

# Model Development

## Final Data Split

In [21]:
# x-variables used in the OLS model
# x_variables lists the same columns as bwdf_data, so the "FULL" and "OLS"
# splits below contain the same data
ols_data = bwdf[x_variables]

# response variable
bwdf_target = bwdf['bwght']

# settings shared by every split: 25% held out, fixed seed for reproducibility
split_options = dict(test_size = 0.25, random_state = 219)

###############################################
## setting up more than one train-test split ##
###############################################
# FULL X-dataset (normal Y)
x_train_FULL, x_test_FULL, y_train_FULL, y_test_FULL = train_test_split(
    bwdf_data, bwdf_target, **split_options)

# OLS x-dataset (normal Y)
x_train_OLS, x_test_OLS, y_train_OLS, y_test_OLS = train_test_split(
    ols_data, bwdf_target, **split_options)

## OLS Regression

In [22]:
# INSTANTIATING a model object
lr = LinearRegression()


# FITTING to the training data
lr_fit = lr.fit(x_train, y_train)


# PREDICTING on new data
lr_pred = lr_fit.predict(x_test)


# SCORING the results (R-square), computing each score once and reusing it
# the built-in round() is used instead of the .round() method because it works
# whether score() returns a NumPy scalar or a plain Python float
lr_train_score = round(lr.score(x_train, y_train), 4)
lr_test_score  = round(lr.score(x_test, y_test), 4)

# absolute gap between training and testing scores
lr_test_gap = round(abs(lr_train_score - lr_test_score), 4)


# displaying the scores and the gap between training and testing
print('OLS Training Score :', lr_train_score)
print('OLS Testing Score  :', lr_test_score)
print('OLS Train-Test Gap :', lr_test_gap)

OLS Training Score : 0.72
OLS Testing Score  : 0.6235
OLS Train-Test Gap : 0.0965


In [23]:
# pairing each feature name with its coefficient, rounded to 2 decimals
lr_model_values = zip(bwdf_data[x_variables].columns,
                      np.round(lr_fit.coef_, 2))


# model description: the intercept first, then every (feature, coefficient) pair
lr_model_lst = [('intercept', np.round(lr_fit.intercept_, 2)), *lr_model_values]


# printing one pair per line
print(*lr_model_lst, sep = '\n')

('intercept', 6961.83)
('cigs', -36.3)
('drink', -110.36)
('mwhte', 106.72)
('mblck', 12.61)
('moth', -119.33)
('fwhte', -85.13)
('fblck', -23.09)
('foth', 108.23)
('log_fage_mage', -361.81)
('fageH_mageL', 119.15)
('fageL_mageH', 63.25)
('fageH_mageH', -102.94)
('fageL_mageL', -79.47)
('omapsH_fmapsL', 57.9)
('omapsL_fmapsH', -35.57)
('omapsH_fmapsH', 102.36)
('omapsL_fmapsL', -124.69)


## Lasso Regression

In [24]:
# INSTANTIATING a model object
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2, where this call raises a TypeError. It is kept so the fitted
# coefficients stay as reported; when upgrading, scale the features beforehand
# and re-tune alpha.
lasso_model = Lasso(alpha = 1.0, normalize = True) # default magnitude


# FITTING to the training data
lasso_fit = lasso_model.fit(x_train_FULL, y_train_FULL)


# PREDICTING on new data
lasso_pred = lasso_fit.predict(x_test_FULL)


# SCORING the results (R-square), computing each score once and reusing it
# the built-in round() is used instead of the .round() method because it works
# whether score() returns a NumPy scalar or a plain Python float
lasso_train_score = round(lasso_model.score(x_train_FULL, y_train_FULL), 4)
lasso_test_score  = round(lasso_model.score(x_test_FULL, y_test_FULL), 4)

# absolute gap between training and testing scores
lasso_test_gap = round(abs(lasso_train_score - lasso_test_score), 4)


# displaying the scores and the gap between training and testing
print('Lasso Training Score :', lasso_train_score)
print('Lasso Testing Score  :', lasso_test_score)
print('Lasso Train-Test Gap :', lasso_test_gap)

Lasso Training Score : 0.7139
Lasso Testing Score  : 0.6734
Lasso Train-Test Gap : 0.0405


In [25]:
# pairing each feature name with its coefficient, rounded to 2 decimals
lasso_model_values = zip(bwdf_data.columns, np.round(lasso_fit.coef_, 2))


# model description: the intercept first, then every (feature, coefficient) pair
lasso_model_lst = [('intercept', np.round(lasso_fit.intercept_, 2)),
                   *lasso_model_values]


# printing one pair per line
print(*lasso_model_lst, sep = '\n')

('intercept', 6283.48)
('cigs', -35.51)
('drink', -107.47)
('mwhte', 0.0)
('mblck', 0.0)
('moth', -16.56)
('fwhte', -0.0)
('fblck', 0.0)
('foth', 0.0)
('log_fage_mage', -273.63)
('fageH_mageL', 81.39)
('fageL_mageH', 66.32)
('fageH_mageH', -105.44)
('fageL_mageL', -0.0)
('omapsH_fmapsL', 0.0)
('omapsL_fmapsH', -0.0)
('omapsH_fmapsH', 67.34)
('omapsL_fmapsL', -107.93)


## ARD Regression

In [26]:
# INSTANTIATING a model object
ard_model = ARDRegression()


# FITTING the training data
ard_fit = ard_model.fit(x_train, y_train)


# PREDICTING on new data
ard_pred = ard_fit.predict(x_test)


# SCORING the results (R-square), computing each score once and reusing it
# the built-in round() is used instead of the .round() method because it works
# whether score() returns a NumPy scalar or a plain Python float
ard_train_score = round(ard_model.score(x_train, y_train), 4)
ard_test_score  = round(ard_model.score(x_test, y_test), 4)

# absolute gap between training and testing scores
ard_test_gap = round(abs(ard_train_score - ard_test_score), 4)


# displaying the scores and the gap between training and testing
print('Training Score:', ard_train_score)
print('Testing Score :', ard_test_score)
print('ARD Train-Test Gap :', ard_test_gap)

Training Score: 0.7114
Testing Score : 0.6843
ARD Train-Test Gap : 0.0271


In [27]:
# pairing each feature name with its coefficient, rounded to 5 decimals
ard_model_values = zip(bwdf_data.columns, np.round(ard_fit.coef_, 5))


# model description: the intercept (2 decimals) first, then every
# (feature, coefficient) pair
ard_model_lst = [('intercept', np.round(ard_fit.intercept_, 2)),
                 *ard_model_values]


# printing one pair per line
print(*ard_model_lst, sep = '\n')

('intercept', 6372.34)
('cigs', -36.46144)
('drink', -111.12127)
('mwhte', 0.00033)
('mblck', 0.00044)
('moth', -0.00086)
('fwhte', -0.00031)
('fblck', 0.00027)
('foth', 4e-05)
('log_fage_mage', -270.68955)
('fageH_mageL', 0.00114)
('fageL_mageH', 0.00165)
('fageH_mageH', -160.26716)
('fageL_mageL', -37.50414)
('omapsH_fmapsL', -7e-05)
('omapsL_fmapsH', -0.0007)
('omapsH_fmapsH', 26.0684)
('omapsL_fmapsL', -104.25547)


# Model Comparison

In [29]:
# printing a plain-text comparison of the three models
print(f"""
Model      Train Score      Test Score      Train-Test Gap
-----      -----------      ----------      --------------
OLS        {lr_train_score}             {lr_test_score}           {lr_test_gap}
Lasso      {lasso_train_score}           {lasso_test_score}           {lasso_test_gap}
ARD(Final) {ard_train_score}           {ard_test_score}           {ard_test_gap}
""")

# one record per model: name, train score, test score, gap and model size
# model size is the length of the coefficient list, intercept included
summary_rows = [
    ('OLS',   lr_train_score,    lr_test_score,    lr_test_gap,    len(lr_model_lst)),
    ('Lasso', lasso_train_score, lasso_test_score, lasso_test_gap, len(lasso_model_lst)),
    ('ARD*',  ard_train_score,   ard_test_score,   ard_test_gap,   len(ard_model_lst)),
]

# collecting the records into a DataFrame
model_performance = pd.DataFrame(
    summary_rows,
    columns = ['Model Type', 'Training', 'Testing', 'Train-Test Gap', 'Model Size'])




Model      Train Score      Test Score      Train-Test Gap
-----      -----------      ----------      --------------
OLS        0.72             0.6235           0.0965
Lasso      0.7139           0.6734           0.0405
ARD(Final) 0.7114           0.6843           0.0271

