In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import math
import seaborn as sns
import sklearn
from sklearn import linear_model
from sklearn import preprocessing
%matplotlib inline
sns.set_style('white')

In [5]:
df = pd.read_csv(
    'https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/credit-card-defaults.csv'
).iloc[:,1:].dropna()

df['default'] = np.where(df['default']=='Yes', 1, 0)
df['student'] = np.where(df['student']=='Yes', 1, 0)
names = df.columns
df = pd.DataFrame(preprocessing.scale(df), columns=names)

trainsize = int(df.shape[0] / 2)
df_test = df.iloc[trainsize:, :].copy()
df_train = df.iloc[:trainsize, :].copy()

df_train['balance_student'] = df_train['balance'] * df_train['student']
df_train['balance_default'] = df_train['balance'] * df_train['default']
df_train['student_default'] = df_train['student'] * df_train['default']
df_train['balance_sqrt'] = (df_train['balance'] + 100) ** .5
df_train['balance2'] = (df_train['balance'] + 100) ** 2
df_train['balance3'] = (df_train['balance'] + 100) ** 3

Y_train = df_train['income'].values.reshape(-1, 1)
X_train2 = df_train.loc[:, ~(df_train.columns).isin(['income'])]

df_test['balance_student'] = df_test['balance'] * df_test['student']
df_test['balance_default'] = df_test['balance'] * df_test['default']
df_test['student_default'] = df_test['student'] * df_test['default']
df_test['balance_sqrt'] = (df_test['balance'] + 100) ** .5
df_test['balance2'] = (df_test['balance'] + 100) ** 2
df_test['balance3'] = (df_test['balance'] + 100) ** 3

Y_test = df_test['income'].values.reshape(-1, 1)
X_test2 = df_test.loc[:, ~(df_test.columns).isin(['income'])]

In [6]:
#Lasso regression:
lassBig = linear_model.Lasso(alpha=.35)
lassBig.fit(X_train2, Y_train)
print('\nR² for the model with many features:')
print(lassBig.score(X_train2, Y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)
print(lassBig.score(X_test2, Y_test))


R² for the model with many features:
0.44363376712897085

Parameter estimates for the model with many features:
[ 0.00000000e+00 -3.89351238e-01  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -2.77688887e-04
 -7.09158792e-07  3.48711577e+00]
0.4380466345914474


In [8]:
#Ridge regression:
ridgeBig = linear_model.Ridge(alpha=.35)
ridgeBig.fit(X_train2, Y_train)
print('\nR² for the model with many features:')
print(ridgeBig.score(X_train2, Y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)
print(ridgeBig.score(X_test2, Y_test))


R² for the model with many features:
0.5739723127685943

Parameter estimates for the model with many features:
[ 0.00000000e+00 -3.89351238e-01  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -2.77688887e-04
 -7.09158792e-07  3.48711577e+00]
0.5630004181468413


In [9]:
#Lasso regression:
lassBig = linear_model.Lasso(alpha=1)
lassBig.fit(X_train2, Y_train)
print('\nR² for the model with many features:')
print(lassBig.score(X_train2, Y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)
print(lassBig.score(X_test2, Y_test))

#Ridge regression:
ridgeBig = linear_model.Ridge(alpha=1)
ridgeBig.fit(X_train2, Y_train)
print('\nR² for the model with many features:')
print(ridgeBig.score(X_train2, Y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)
print(ridgeBig.score(X_test2, Y_test))


R² for the model with many features:
0.026834134351719777

Parameter estimates for the model with many features:
[ 0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -5.42632541e-06  5.43049680e+00]
0.019253331677773544

R² for the model with many features:
0.5739722903477165

Parameter estimates for the model with many features:
[ 0.00000000e+00 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
 -5.42632541e-06  5.43049680e+00]
0.5630027836702092


In [10]:
#Lasso regression:
lassBig = linear_model.Lasso(alpha=0)
lassBig.fit(X_train2, Y_train)
print('\nR² for the model with many features:')
print(lassBig.score(X_train2, Y_train))
origparams = np.append(lassBig.coef_, lassBig.intercept_)
print('\nParameter estimates for the model with many features:')
print(origparams)
print(lassBig.score(X_test2, Y_test))


R² for the model with many features:
0.5739444483155421

Parameter estimates for the model with many features:
[-2.06111494e-03 -7.58818422e-01  6.04810006e-02 -3.53546655e-03
  9.26788633e-03 -3.79292001e-03  1.76197742e-01 -2.05761745e-04
 -9.11532536e-07  1.20391468e+00]
0.5631593020584263


  This is separate from the ipykernel package so we can avoid doing imports until
  positive)
  positive)
