In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
%matplotlib inline

import sklearn
from sklearn import linear_model
from sklearn import preprocessing
import statsmodels.api as sm

## Load dataset

In [2]:
## Load data

gss = pd.read_csv('../../GSS_2012/ICPSR_35478/DS0001/35478-0001-Data.tsv', sep='\t')
gss.head()

Unnamed: 0,YEAR,ID,INTID,FEEUSED,FEELEVEL,DATEINTV,LNGTHINV,INTAGE,INTETHN,MODE,...,SAMPCODE,SAMPLE,OVERSAMP,WTSS,WTSSNR,WTSSALL,WTCOMB,WTCOMBNR,VSTRAT,VPSU
0,2012,1,49,1,75,721,69,60,1,1,...,601,10,1,2.621963,2.869532,2.621963,6.402159,7.006659,-1,-1
1,2012,2,150,1,75,624,53,32,1,1,...,601,10,1,3.49595,3.826043,3.49595,6.514477,7.129583,-1,-1
2,2012,3,150,1,75,627,77,32,1,1,...,601,10,1,1.747975,1.913021,1.747975,1.67113,1.82892,-1,-1
3,2012,4,49,1,20,527,78,60,1,1,...,601,10,1,1.235694,1.35237,1.235694,1.18137,1.292917,-1,-1
4,2012,5,235,1,75,620,149,62,1,1,...,601,10,1,0.873988,0.956511,0.873988,0.835565,0.91446,-1,-1


## Choose variables

In [3]:
## Choose some features

gss_select = pd.DataFrame()
gss_select['id'] = gss['ID']
gss_select['wrkstat'] = gss['WRKSTAT']
## Work status

gss_select['marital'] = gss['MARITAL']
## 1 = married, 2 = widowed, 3 = divorced, 4 = separated, 5 = never married 9 = no answer

gss_select['divorce'] = gss['DIVORCE']
## 1 = yes, 2 = no, 0 = inapplicable, 8,9 = don't know/no answer

gss_select['sibs'] = gss['SIBS']
gss_select['children'] = gss['CHILDS']
gss_select['age'] = gss['AGE']
gss_select['age_1_child'] = gss['AGEKDBRN']
## Age first child born

gss_select['degree'] = gss['DEGREE']
## 0 = less than high school, 1 = high school, 2 = junior college, 
## 3 = bachelor, 4 = graduate

gss_select['sex'] = gss['SEX']
gss_select.head()

Unnamed: 0,id,wrkstat,marital,divorce,sibs,children,age,age_1_child,degree,sex
0,1,2,5,0,1,0,22,0,3,1
1,2,2,5,0,2,0,21,0,1,1
2,3,1,1,2,1,2,42,32,1,1
3,4,8,1,1,2,2,49,24,1,2
4,5,5,4,0,0,3,70,24,3,2


In [4]:
## Clean some features

gss_clean = pd.DataFrame()

## Change full time, part time and with a job (1, 2, 3) to employed, otherwise unemployed
gss_clean['wrkstat'] = np.where(gss_select['wrkstat'].isin([1, 2, 3]), 1, 0)

## Currently divorced or separated (3, 4)
gss_clean['marital'] = np.where(gss_select['marital'].isin([3, 4]), 1, 0)

## Ever divorced or separated (1)
gss_clean['divorce'] = np.where(gss_select['divorce'] == 1, 1, 0)

## 1 = Male, 2 = female, change female to 0
gss_clean['sex'] = np.where(gss_select['sex'] == 2, 0, 1)

gss_clean
gss_clean.head()

Unnamed: 0,wrkstat,marital,divorce,sex
0,1,0,0,1
1,1,0,0,1
2,1,0,0,1
3,0,0,1,0
4,0,1,0,0


In [5]:
## Create features dataframe

gss_features = gss_select.copy()
gss_features.drop(['id', 'wrkstat', 'marital', 'divorce', 'sex'], axis=1, inplace=True)
gss_features['wrkstat'] = gss_clean['wrkstat']
gss_features['sex'] = gss_clean['sex']
gss_features['div_sep'] = gss_clean['marital'] + gss_clean['divorce']
gss_features.head()

Unnamed: 0,sibs,children,age,age_1_child,degree,wrkstat,sex,div_sep
0,1,0,22,0,3,1,1,0
1,2,0,21,0,1,1,1,0
2,1,2,42,32,1,1,1,0
3,2,2,49,24,1,0,0,1
4,0,3,70,24,3,0,0,1


## Check features for sense and remove some values

In [6]:
## Check for sense - all people with no children have no 1st child age

gss_features.loc[gss_features['children'] == 0, 'age_1_child'].unique()

array([0])

In [7]:
## Remove unknown values for siblings

gss_features.sibs.unique()

array([ 1,  2,  0,  4,  6,  7,  5,  9,  3, 10, 15,  8, 13, 12, 11, 98, 99,
       14, 30, 16, 17, 22, 18, 20, 21, 58, 19])

In [8]:
sibs_drop = gss_features.loc[gss_features.sibs.isin([98, 99]),].index
gss_features.drop(sibs_drop, inplace=True)
gss_features.sibs.unique()

array([ 1,  2,  0,  4,  6,  7,  5,  9,  3, 10, 15,  8, 13, 12, 11, 14, 30,
       16, 17, 22, 18, 20, 21, 58, 19])

In [9]:
## Check number of children - 9 is don't know so remove

gss_features.children.value_counts()

2    1372
0    1250
3     784
1     696
4     390
5     155
6      75
8      52
7      35
9       4
Name: children, dtype: int64

In [10]:
children_drop = gss_features.loc[gss_features.children == 9, ].index
gss_features.drop(children_drop, inplace=True)
gss_features.children.value_counts()

2    1372
0    1250
3     784
1     696
4     390
5     155
6      75
8      52
7      35
Name: children, dtype: int64

In [11]:
## Remove no answer to age

age_drop = gss_features.loc[gss_features['age'] == 99, ].index
gss_features.drop(age_drop, inplace=True)

In [12]:
## Remove don't know and no answer to age when 1st child born

child_age_drop = gss_features.loc[gss_features['age_1_child'].isin([98, 99]),].index
gss_features.drop(child_age_drop, inplace=True)

In [13]:
## Check 0s all make sense
gss_features.loc[gss_features['age_1_child'] == 0, 'children'].unique()

array([0])

In [14]:
gss_features.degree.unique()

array([3, 1, 2, 0, 4])

In [15]:
gss_features.reset_index(inplace=True, drop=True)

In [16]:
## Data retained:

len(gss_features) / len(gss)

0.9852697095435685

In [17]:
gss_features.head()

Unnamed: 0,sibs,children,age,age_1_child,degree,wrkstat,sex,div_sep
0,1,0,22,0,3,1,1,0
1,2,0,21,0,1,1,1,0
2,1,2,42,32,1,1,1,0
3,2,2,49,24,1,0,0,1
4,0,3,70,24,3,0,0,1


In [18]:
sum(gss_features.div_sep)/len(gss_features)

0.337544746262371

## Engineer more features

In [19]:
gss_features['age_age_1_child'] = gss_features['age'] * gss_features['age_1_child']
gss_features['children_age_1_child'] = gss_features['children'] * gss_features['age_1_child']
gss_features['sibs_children'] = gss_features['sibs'] * gss_features['children']
gss_features['degree_wrkstat'] = gss_features['degree'] * gss_features['wrkstat']
gss_features['wrkstat_sex'] = gss_features['wrkstat'] * gss_features['sex']
gss_features['age_1_child_sex'] = gss_features['age_1_child'] * gss_features['sex']
gss_features['age2'] = gss_features['age'] ** 2
gss_features['age3'] = gss_features['age'] ** 3
gss_features['age_sqrt'] = gss_features['age'] ** 0.5

gss_features.head()

Unnamed: 0,sibs,children,age,age_1_child,degree,wrkstat,sex,div_sep,age_age_1_child,children_age_1_child,sibs_children,degree_wrkstat,wrkstat_sex,age_1_child_sex,age2,age3,age_sqrt
0,1,0,22,0,3,1,1,0,0,0,0,3,1,0,484,10648,4.690416
1,2,0,21,0,1,1,1,0,0,0,0,1,1,0,441,9261,4.582576
2,1,2,42,32,1,1,1,0,1344,64,2,1,1,32,1764,74088,6.480741
3,2,2,49,24,1,0,0,1,1176,48,4,0,0,0,2401,117649,7.0
4,0,3,70,24,3,0,0,1,1680,72,0,0,0,0,4900,343000,8.3666


## Create models
### Split into train and test set

In [20]:
y_all = gss_features['div_sep']
x_all = gss_features.drop('div_sep', axis=1)

In [21]:
x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x_all, y_all, test_size=0.3)

### Vanilla regression

In [22]:
lr_1 = sm.Logit(y_train, x_train)
result_1 = lr_1.fit()
pred_1 = result_1.predict(x_train)
pred_y_1 = np.where(pred_1 < .5, 0, 1)
table_1 = pd.crosstab(y_train, pred_y_1)
print(table_1)
print('Accuracy: ')
print('{0:.3f}'.format((table_1.iloc[0,0] + table_1.iloc[1, 1]) / table_1.sum().sum()))

Optimization terminated successfully.
         Current function value: 0.553785
         Iterations 7
col_0       0    1
div_sep           
0        1919  291
1         723  391
Accuracy: 
0.695


In [23]:
result_1.summary()

0,1,2,3
Dep. Variable:,div_sep,No. Observations:,3324.0
Model:,Logit,Df Residuals:,3308.0
Method:,MLE,Df Model:,15.0
Date:,"Fri, 19 Apr 2019",Pseudo R-squ.:,0.1317
Time:,18:44:42,Log-Likelihood:,-1840.8
converged:,True,LL-Null:,-2119.9
,,LLR p-value:,2.588e-109

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
sibs,0.0195,0.024,0.800,0.424,-0.028,0.067
children,0.5910,0.095,6.230,0.000,0.405,0.777
age,0.9338,0.168,5.544,0.000,0.604,1.264
age_1_child,0.0080,0.016,0.487,0.626,-0.024,0.040
degree,-0.1301,0.054,-2.405,0.016,-0.236,-0.024
wrkstat,0.0968,0.155,0.625,0.532,-0.207,0.400
sex,-0.1748,0.195,-0.898,0.369,-0.556,0.207
age_age_1_child,8.827e-05,0.000,0.293,0.770,-0.001,0.001
children_age_1_child,-0.0213,0.004,-4.930,0.000,-0.030,-0.013


In [24]:
## Test on test set

pred_1_test = result_1.predict(x_test)
pred_y_1_test = np.where(pred_1_test < .5, 0, 1)
table_1_test = pd.crosstab(y_test, pred_y_1_test)
print('Accuracy: {0:.3f}'.format((table_1_test.iloc[0, 0] + table_1_test.iloc[1, 1])/table_1_test.sum().sum()))

Accuracy: 0.688


### Ridge regression

In [25]:
## First attempt - set C as very large

lr_2 = linear_model.LogisticRegression(C=1e9, solver='liblinear')
fit_2 = lr_2.fit(x_train, y_train)
pred_y_2 = lr_2.predict(x_train)
print(pd.crosstab(y_train, pred_y_2))
print('Accuracy: {0:.3f}'.format(lr_2.score(x_train, y_train)))

col_0       0    1
div_sep           
0        1916  294
1         751  363
Accuracy: 0.686


In [26]:
## Use cross validation to experiment with C

def lr_cost(c_min, c_max, c_step):
    for i in np.arange(c_min, c_max, c_step):
        lr = linear_model.LogisticRegression(C=i, solver='liblinear')
        lr_score = sklearn.model_selection.cross_val_score(lr, x_train, y_train, cv=5)
        print('\nlr with C = {0:0f} \nValues = {1!s} \nMean = {2:.3f}'.format(i, str(lr_score), lr_score.mean()))

In [27]:
lr_cost(0.01, 2, 0.1)


lr with C = 0.010000 
Values = [0.64661654 0.7037594  0.66466165 0.68721805 0.64457831] 
Mean = 0.669

lr with C = 0.110000 
Values = [0.64962406 0.7037594  0.69022556 0.68421053 0.67921687] 
Mean = 0.681

lr with C = 0.210000 
Values = [0.64210526 0.69172932 0.69022556 0.68571429 0.67018072] 
Mean = 0.676

lr with C = 0.310000 
Values = [0.64511278 0.7037594  0.66466165 0.6962406  0.67620482] 
Mean = 0.677

lr with C = 0.410000 
Values = [0.64962406 0.7037594  0.68421053 0.68120301 0.65512048] 
Mean = 0.675

lr with C = 0.510000 
Values = [0.66315789 0.69924812 0.67819549 0.6962406  0.67018072] 
Mean = 0.681

lr with C = 0.610000 
Values = [0.64661654 0.7037594  0.69022556 0.68721805 0.67921687] 
Mean = 0.681

lr with C = 0.710000 
Values = [0.6481203  0.7037594  0.66165414 0.6962406  0.68524096] 
Mean = 0.679

lr with C = 0.810000 
Values = [0.64962406 0.7037594  0.66466165 0.68721805 0.68524096] 
Mean = 0.678

lr with C = 0.910000 
Values = [0.6481203  0.7037594  0.66466165 0.68721

In [28]:
lr_cost(0.001, 0.1, 0.01)


lr with C = 0.001000 
Values = [0.64661654 0.69473684 0.68120301 0.69323308 0.6686747 ] 
Mean = 0.677

lr with C = 0.011000 
Values = [0.64962406 0.7037594  0.6887218  0.68120301 0.68524096] 
Mean = 0.682

lr with C = 0.021000 
Values = [0.6481203  0.68571429 0.66466165 0.68120301 0.65512048] 
Mean = 0.667

lr with C = 0.031000 
Values = [0.64360902 0.7037594  0.68571429 0.68120301 0.68373494] 
Mean = 0.680

lr with C = 0.041000 
Values = [0.6481203  0.7037594  0.66466165 0.68120301 0.65813253] 
Mean = 0.671

lr with C = 0.051000 
Values = [0.6481203  0.70225564 0.67819549 0.69473684 0.67319277] 
Mean = 0.679

lr with C = 0.061000 
Values = [0.64962406 0.70676692 0.67819549 0.68571429 0.6686747 ] 
Mean = 0.678

lr with C = 0.071000 
Values = [0.64962406 0.70526316 0.69022556 0.68421053 0.6686747 ] 
Mean = 0.680

lr with C = 0.081000 
Values = [0.64962406 0.7037594  0.68421053 0.68120301 0.68524096] 
Mean = 0.681

lr with C = 0.091000 
Values = [0.65112782 0.70225564 0.69022556 0.69624

In [29]:
lr_cost(0.00001, 0.0001, 0.00001 )


lr with C = 0.000010 
Values = [0.66466165 0.66315789 0.66616541 0.66616541 0.66415663] 
Mean = 0.665

lr with C = 0.000020 
Values = [0.66315789 0.66466165 0.66466165 0.66616541 0.66566265] 
Mean = 0.665

lr with C = 0.000030 
Values = [0.66165414 0.66466165 0.66616541 0.66616541 0.66114458] 
Mean = 0.664

lr with C = 0.000040 
Values = [0.66015038 0.66616541 0.66466165 0.66616541 0.66114458] 
Mean = 0.664

lr with C = 0.000050 
Values = [0.66015038 0.66315789 0.66466165 0.66766917 0.65512048] 
Mean = 0.662

lr with C = 0.000060 
Values = [0.66015038 0.66466165 0.66616541 0.66766917 0.65662651] 
Mean = 0.663

lr with C = 0.000070 
Values = [0.66015038 0.66616541 0.66766917 0.66616541 0.65813253] 
Mean = 0.664

lr with C = 0.000080 
Values = [0.6556391  0.66766917 0.66766917 0.66466165 0.65963855] 
Mean = 0.663

lr with C = 0.000090 
Values = [0.65864662 0.66917293 0.67368421 0.66466165 0.65963855] 
Mean = 0.665


In [30]:
## Go for C = 0.031 as is highest mean.

## Retrain on this cost

In [31]:
lr_2a = linear_model.LogisticRegression(C=0.031, solver='liblinear')
fit_lr_2a = lr_2a.fit(x_train, y_train)
pred_y_2a = lr_2a.predict(x_train)
print(pd.crosstab(y_train, pred_y_2a))
print('Accuracy: {0:.3f}'.format(lr_2a.score(x_train, y_train)))

col_0       0    1
div_sep           
0        1892  318
1         742  372
Accuracy: 0.681


In [32]:
## Test on test set

pred_y_2a_test = lr_2a.predict(x_test)
print(pd.crosstab(y_test, pred_y_2a_test))
print('Accuracy: {0:.3f}'.format(lr_2a.score(x_test, y_test)))

col_0      0    1
div_sep          
0        817  119
1        342  147
Accuracy: 0.676


### Lasso regression

In [33]:
## First attempt with very high C

lr_3 = linear_model.LogisticRegression(C=1e9, penalty='l1', solver='liblinear')
fit_lr_3 = lr_3.fit(x_train, y_train)
pred_y_3 = lr_3.predict(x_train)
print(pd.crosstab(y_train, pred_y_3))
print('Accuracy: {0:.3f}'.format(lr_3.score(x_train, y_train)))

col_0       0    1
div_sep           
0        1898  312
1         706  408
Accuracy: 0.694


In [34]:
lr_3.coef_

array([[ 1.94889933e-02,  5.59562156e-01,  8.35768432e-02,
         1.29266388e-02, -1.42244741e-01,  1.01984477e-01,
        -1.64262361e-01,  8.28101528e-07, -1.96682203e-02,
        -1.64927924e-02,  5.50039304e-02, -5.22683959e-01,
         1.83788836e-02,  9.39421646e-04, -1.93133922e-05,
         2.29495349e-01]])

In [35]:
## Function to change cost

def lr_3_cost(c_min, c_max, c_step):
    for i in np.arange(c_min, c_max, c_step):
        lr = linear_model.LogisticRegression(C=i, penalty='l1', solver='liblinear', max_iter=500)
        lr_score = sklearn.model_selection.cross_val_score(lr, x_train, y_train, cv=5)
        print('\nLasso with C = {0:.3f} \nValues = {1} \nMean = {2:.3f}'.format(i, lr_score, lr_score.mean()))

In [36]:
lr_3_cost(0.01, 3, 0.1)


Lasso with C = 0.010 
Values = [0.64962406 0.6962406  0.68421053 0.68421053 0.66114458] 
Mean = 0.675

Lasso with C = 0.110 
Values = [0.67518797 0.72932331 0.67518797 0.68571429 0.67620482] 
Mean = 0.688

Lasso with C = 0.210 
Values = [0.68721805 0.72781955 0.67669173 0.68421053 0.67319277] 
Mean = 0.690

Lasso with C = 0.310 
Values = [0.69323308 0.72631579 0.67969925 0.67669173 0.67319277] 
Mean = 0.690

Lasso with C = 0.410 
Values = [0.68421053 0.72330827 0.68270677 0.67819549 0.6746988 ] 
Mean = 0.689

Lasso with C = 0.510 
Values = [0.68721805 0.72030075 0.67819549 0.68270677 0.6746988 ] 
Mean = 0.689

Lasso with C = 0.610 
Values = [0.68721805 0.72030075 0.68270677 0.68120301 0.67319277] 
Mean = 0.689

Lasso with C = 0.710 
Values = [0.68721805 0.71729323 0.68270677 0.68120301 0.67319277] 
Mean = 0.688

Lasso with C = 0.810 
Values = [0.68721805 0.71729323 0.68270677 0.68270677 0.67319277] 
Mean = 0.689

Lasso with C = 0.910 
Values = [0.68721805 0.71729323 0.68270677 0.68120

In [44]:
lr_3_cost(0.01, 0.1, 0.01)


Lasso with C = 0.010 
Values = [0.64962406 0.6962406  0.68571429 0.68421053 0.65963855] 
Mean = 0.675

Lasso with C = 0.020 
Values = [0.64511278 0.69172932 0.67819549 0.6887218  0.67319277] 
Mean = 0.675

Lasso with C = 0.030 
Values = [0.65263158 0.70075188 0.68421053 0.69323308 0.68072289] 
Mean = 0.682

Lasso with C = 0.040 
Values = [0.66165414 0.71278195 0.68421053 0.69172932 0.68524096] 
Mean = 0.687

Lasso with C = 0.050 
Values = [0.66766917 0.71578947 0.68421053 0.69473684 0.68222892] 
Mean = 0.689

Lasso with C = 0.060 
Values = [0.67218045 0.72030075 0.67819549 0.68721805 0.68072289] 
Mean = 0.688

Lasso with C = 0.070 
Values = [0.67067669 0.72180451 0.68120301 0.6887218  0.68072289] 
Mean = 0.689

Lasso with C = 0.080 
Values = [0.67067669 0.72180451 0.67518797 0.6887218  0.68222892] 
Mean = 0.688

Lasso with C = 0.090 
Values = [0.67218045 0.72330827 0.67368421 0.68571429 0.67620482] 
Mean = 0.686


In [45]:
## Choose 0.05 as this is where it starts to stablize around 69% accuracy

## Retrain model

lr_3a = linear_model.LogisticRegression(C=0.05, solver='liblinear', penalty='l1', max_iter=500)
lr_3a_fit = lr_3a.fit(x_train, y_train)
y_pred_3a = lr_3a.predict(x_train)
print(pd.crosstab(y_train, y_pred_3a))
print('Accuracy {0:.3f}'.format(lr_3a.score(x_train, y_train)))

col_0       0    1
div_sep           
0        1892  318
1         715  399
Accuracy 0.689


In [46]:
lr_3a.coef_

array([[ 0.00000000e+00,  3.59536888e-01, -1.16748333e-02,
         1.68099598e-02, -1.11631630e-01,  0.00000000e+00,
         0.00000000e+00, -1.54722321e-06, -1.15236679e-02,
        -7.73026769e-03,  0.00000000e+00, -1.50809691e-01,
         1.79929851e-03,  4.28313886e-03, -4.05298906e-05,
        -7.88781673e-01]])

In [47]:
lr_3.coef_

array([[ 1.94889933e-02,  5.59562156e-01,  8.35768432e-02,
         1.29266388e-02, -1.42244741e-01,  1.01984477e-01,
        -1.64262361e-01,  8.28101528e-07, -1.96682203e-02,
        -1.64927924e-02,  5.50039304e-02, -5.22683959e-01,
         1.83788836e-02,  9.39421646e-04, -1.93133922e-05,
         2.29495349e-01]])

In [55]:
x_test.columns

Index(['sibs', 'children', 'age', 'age_1_child', 'degree', 'wrkstat', 'sex',
       'age_age_1_child', 'children_age_1_child', 'sibs_children',
       'degree_wrkstat', 'wrkstat_sex', 'age_1_child_sex', 'age2', 'age3',
       'age_sqrt'],
      dtype='object')

In [48]:
## Test on test set

y_pred_3a_test = lr_3a.predict(x_test)
print(pd.crosstab(y_test, y_pred_3a_test))
print('Accuracy {0:.3f}'.format(lr_3a.score(x_test, y_test)))

col_0      0    1
div_sep          
0        821  115
1        326  163
Accuracy 0.691


## Evaluation

In [54]:
print('Accuracy vanilla: {0:.3f}'.format((table_1_test.iloc[0, 0] + table_1_test.iloc[1, 1])/table_1_test.sum().sum()))
print('Accuracy ridge: {0:.3f}'.format(lr_2a.score(x_test, y_test)))
print('Accuracy lasso: {0:.3f}'.format(lr_3a.score(x_test, y_test)))
print('Accuracy guessing no divorces: {0:.3f}'.format(sum(y_test == 0) / len(y_test)))

Accuracy vanilla: 0.688
Accuracy ridge: 0.676
Accuracy lasso: 0.691
Accuracy guessing no divorces: 0.657


All models did little better than simply guessing. However, the lasso did do marginally better than the others, potentially due to the reduction in overfitting.

4 of the coefficients were reduced to zero in the lasso regression. These were number of siblings, work status, sex and (degree x work status). This suggests these variables are not necessary for prediction. The fact that all the coefficients are quite small and the accuracy is not great suggests that none are great predictors though.

Additional variables should be considered to see if this can improve the accuracy of the model because at the current time it is not much better than guessing.

In [59]:
print('Vanilla:')
print(pd.crosstab(y_test, pred_y_1_test))
print('\nRidge:')
print(pd.crosstab(y_test, pred_y_2a_test))
print('\nLasso:')
print(pd.crosstab(y_test, y_pred_3a_test))

Vanilla:
col_0      0    1
div_sep          
0        813  123
1        322  167

Ridge:
col_0      0    1
div_sep          
0        817  119
1        342  147

Lasso:
col_0      0    1
div_sep          
0        821  115
1        326  163


The lasso model did better than the vanilla by guessing more people were not divorced - i.e. towards the bias in the data. Suggests that maybe the model is really not great!