In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import linear_model

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV, train_test_split


In [2]:
# Load card.dta (Stata format) directly from the Wooldridge data directory.
df = pd.read_stata(
    filepath_or_buffer='http://fmwww.bc.edu/ec-p/data/wooldridge/card.dta',
)

In [3]:
# Some columns contain NaN values; impute each one with its own column median.
column_medians = df.median()
df = df.fillna(value=column_medians)

In [4]:
# Peek at the first rows to sanity-check the load and the imputation.
df.head()

Unnamed: 0,id,nearc2,nearc4,educ,age,fatheduc,motheduc,weight,momdad14,sinmom14,...,smsa66,wage,enroll,KWW,IQ,married,libcrd14,exper,lwage,expersq
0,2.0,0.0,0.0,7.0,29.0,10.0,12.0,158413.0,1.0,0.0,...,1.0,548.0,0.0,15.0,103.0,1.0,0.0,16.0,6.306275,256.0
1,3.0,0.0,0.0,12.0,27.0,8.0,8.0,380166.0,1.0,0.0,...,1.0,481.0,0.0,35.0,93.0,1.0,1.0,9.0,6.175867,81.0
2,4.0,0.0,0.0,12.0,34.0,14.0,12.0,367470.0,1.0,0.0,...,1.0,721.0,0.0,42.0,103.0,1.0,1.0,16.0,6.580639,256.0
3,5.0,1.0,1.0,11.0,27.0,11.0,12.0,380166.0,1.0,0.0,...,1.0,250.0,0.0,25.0,88.0,1.0,1.0,10.0,5.521461,100.0
4,6.0,1.0,1.0,12.0,34.0,8.0,7.0,367470.0,1.0,0.0,...,1.0,729.0,0.0,34.0,108.0,1.0,0.0,16.0,6.591674,256.0


In [5]:
# (number of rows, number of columns) of the frame.
df.shape

(3010, 34)

In [6]:
# List every column name available in the frame.
df.columns

Index(['id', 'nearc2', 'nearc4', 'educ', 'age', 'fatheduc', 'motheduc',
       'weight', 'momdad14', 'sinmom14', 'step14', 'reg661', 'reg662',
       'reg663', 'reg664', 'reg665', 'reg666', 'reg667', 'reg668', 'reg669',
       'south66', 'black', 'smsa', 'south', 'smsa66', 'wage', 'enroll', 'KWW',
       'IQ', 'married', 'libcrd14', 'exper', 'lwage', 'expersq'],
      dtype='object')

In [7]:
# Inspect the dtype of each column.
df.dtypes

id          float32
nearc2      float32
nearc4      float32
educ        float32
age         float32
fatheduc    float32
motheduc    float32
weight      float32
momdad14    float32
sinmom14    float32
step14      float32
reg661      float32
reg662      float32
reg663      float32
reg664      float32
reg665      float32
reg666      float32
reg667      float32
reg668      float32
reg669      float32
south66     float32
black       float32
smsa        float32
south       float32
smsa66      float32
wage        float32
enroll      float32
KWW         float32
IQ          float32
married     float32
libcrd14    float32
exper       float32
lwage       float32
expersq     float32
dtype: object

In [7]:
# Column names used as predictors (X) and as the regression target (y).
# Both are lists so that df[X] and df[y] each select a DataFrame.
X = (
    ['married', 'exper', 'expersq']
    + ['nearc2', 'nearc4', 'fatheduc', 'motheduc']
    + ['weight', 'momdad14', 'sinmom14', 'step14']
    # Region dummies reg661 ... reg668 (reg669 is not part of the feature set).
    + ['reg66%d' % region for region in range(1, 9)]
    + ['south66', 'black', 'smsa', 'south', 'smsa66']
)

y = ['educ']

In [10]:
def test(models, data, iterations = 100, random_state = None,
         features = None, target = None):
    """Average train/test R^2 of each model over repeated random 80/20 splits.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Every estimator is refitted on each split, so after the call it is
        left fitted on the last training split that was drawn.
    data : pandas.DataFrame
        Frame containing the feature and target columns.
    iterations : int, default 100
        Number of random train/test splits drawn per model.
    random_state : int or None, default None
        Seed for the split generator. When given, results are reproducible
        and every model is scored on the same sequence of splits. When None,
        the splits are unseeded (the original behaviour).
    features : list of str or None, default None
        Feature columns; falls back to the notebook-level ``X`` when None.
    target : list of str or None, default None
        Target column(s); falls back to the notebook-level ``y`` when None.

    Returns
    -------
    pandas.DataFrame
        One column per model. Row 0 holds the mean train R^2 and row 1 the
        mean test R^2.
    """
    feature_cols = X if features is None else features
    target_cols = y if target is None else target

    results = {}

    for name, model in models.items():
        # Re-create the generator for every model so that, when seeded, all
        # models are compared on identical splits.
        rng = None if random_state is None else np.random.RandomState(random_state)

        r2_train = []
        r2_test  = []

        for _ in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(data[feature_cols],
                                                                data[target_cols],
                                                                test_size=0.2,
                                                                random_state=rng)
            # Fit once per split and reuse the fitted model for both scores
            # (the previous version fitted the same model twice per split).
            fitted = model.fit(X_train, y_train)

            r2_test.append(metrics.r2_score(y_test, fitted.predict(X_test)))
            r2_train.append(metrics.r2_score(y_train, fitted.predict(X_train)))

        results[name] = [np.mean(r2_train), np.mean(r2_test)]

    return pd.DataFrame(results)

In [9]:
# Baseline estimators, all with their default hyper-parameters, keyed by display name.
models = dict(
    OLS=linear_model.LinearRegression(),
    Lasso=linear_model.Lasso(),
    Ridge=linear_model.Ridge(),
)

In [11]:
# Score the default-parameter models: row 0 is the mean train R^2, row 1 the mean test R^2.
test(models, df)

Unnamed: 0,OLS,Lasso,Ridge
0,0.535831,0.473276,0.535838
1,0.529073,0.457866,0.529011


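As you can see (row 0 is the mean train R², row 1 the mean test R²), OLS performs marginally better than Ridge and clearly better than LASSO on the test splits. LASSO, with its default penalty, performs really badly.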

Let’s do a grid search over the regularization strength `alpha` of LASSO and Ridge:

In [13]:
# Candidate regularization strengths for each penalized model.
lasso_params = {'alpha': [0.02, 0.024, 0.025, 0.026, 0.03]}
ridge_params = {'alpha': [200, 230, 250, 265, 270, 275, 290, 300, 500]}

# Cross-validated search for the best alpha of each penalized model.
# NOTE(review): both searches are fitted on the full dataset, so the random
# test splits later drawn from df by test() overlap with the data used to
# pick alpha -- confirm this is acceptable for the comparison.
lasso_search = GridSearchCV(linear_model.Lasso(), param_grid=lasso_params)
lasso_search.fit(df[X], df[y])

ridge_search = GridSearchCV(linear_model.Ridge(), param_grid=ridge_params)
ridge_search.fit(df[X], df[y])

# Same line-up as before, with Lasso/Ridge replaced by their tuned versions.
models2 = {
    'OLS': linear_model.LinearRegression(),
    'Lasso': lasso_search.best_estimator_,
    'Ridge': ridge_search.best_estimator_,
}



In [14]:
# Score the tuned models: row 0 is the mean train R^2, row 1 the mean test R^2.
test(models2, df)

Unnamed: 0,OLS,Lasso,Ridge
0,0.537243,0.532011,0.53146
1,0.52383,0.525379,0.528462
