In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import lucem_illud_2020

In [2]:
# Read the data
'''
dtype_dic= {'year': pd.Int64Dtype(), 
            'state_cate': pd.Int64Dtype(),
            'age_cate': pd.Int64Dtype(),
            'age': pd.Int64Dtype(),
            'female_dummy': pd.Int64Dtype(),
            'nonwhite_dummy': pd.Int64Dtype(),
            'marital_dummy': pd.Int64Dtype(),
            'educ_cate': pd.Int64Dtype(),
            'high_school_dummy': pd.Int64Dtype(),
            'college_dummy': pd.Int64Dtype(),
            'graduate_dummy': pd.Int64Dtype(),
            'income_cate': pd.Int64Dtype(),
            'income': pd.Int64Dtype(),
            'precaution_dummy': pd.Int64Dtype(),
            'retire_dummy': pd.Int64Dtype(),
            'fin_par_dummy': pd.Int64Dtype(),
            'math_perceived_cate': pd.Int64Dtype(),
            'fin_perceived_cate': pd.Int64Dtype(),
            'interest_q': pd.Int64Dtype(),
            'inflation_q': pd.Int64Dtype(),
            'bond_q': pd.Int64Dtype(),
            'compound_q': pd.Int64Dtype(),
            'mortgage_q': pd.Int64Dtype(),
            'mutual_q': pd.Int64Dtype()}
'''
df = pd.read_csv("processed_NFCS.csv")
df.head(15)

Unnamed: 0,ID,year,weights,state_cate,state_dummy_1,state_dummy_2,state_dummy_3,state_dummy_4,state_dummy_5,state_dummy_6,...,precaution_dummy,retire_dummy,fin_par_dummy,math_perceived_cate,fin_perceived_cate,interest_q,inflation_q,bond_q,mortgage_q,mutual_q
0,2012010001,2012,0.363417,24,0,0,0,0,0,0,...,0,0,1,7,6,1,1,3,1,1
1,2012010002,2012,1.173593,10,0,0,0,0,0,0,...,0,0,0,6,5,1,2,2,2,2
2,2012010003,2012,1.577671,23,0,0,0,0,0,0,...,1,0,0,7,5,1,1,2,1,1
3,2012010004,2012,1.577671,14,0,0,0,0,0,0,...,0,0,1,4,4,3,2,2,2,2
4,2012010005,2012,2.167569,44,0,0,0,0,0,0,...,1,0,0,6,6,1,3,2,1,3
5,2012010006,2012,0.513483,25,0,0,0,0,0,0,...,0,0,1,1,6,1,3,3,1,3
6,2012010007,2012,2.234989,31,0,0,0,0,0,0,...,1,1,1,7,6,3,1,3,1,1
7,2012010008,2012,0.753603,22,0,0,0,0,0,0,...,1,0,0,5,3,1,1,3,1,1
8,2012010009,2012,1.576065,5,0,0,0,0,1,0,...,1,0,0,5,4,3,1,1,1,2
9,2012010010,2012,1.466195,4,0,0,0,1,0,0,...,0,0,0,2,5,1,3,3,1,2


In [3]:
# Generate all correct and all incorrect indicator
correct_indicator = (df[['interest_q', 'inflation_q', 'bond_q', 'mortgage_q', 'mutual_q']] == 1).sum(axis=1) == 5
df['q_all_correct'] = correct_indicator
incorrect_indicator = (df[['interest_q', 'inflation_q', 'bond_q', 'mortgage_q', 'mutual_q']] == 1).sum(axis=1) == 0
df['q_all_incorrect'] = incorrect_indicator

In [4]:
# Construct pre-trained overconfidence measure
df['overconfidence'] = np.nan
one_indicator = df['q_all_incorrect'] & (df['math_perceived_cate'] > 5) & (df['fin_perceived_cate'] > 5)
zero_indicator = ((df['q_all_incorrect'] & (df['math_perceived_cate'] < 3) & (df['fin_perceived_cate'] < 3)) | 
                  (df['q_all_correct'] & (df['math_perceived_cate'] > 5) & (df['fin_perceived_cate'] > 5)))
df.loc[one_indicator, 'overconfidence'] = 1
df.loc[zero_indicator, 'overconfidence'] = 0

In [10]:
# Construct a dataframe for machine learning
ml_df = df.loc[df['overconfidence'].notnull(), :]
ml_df

Unnamed: 0,ID,year,weights,state_cate,state_dummy_1,state_dummy_2,state_dummy_3,state_dummy_4,state_dummy_5,state_dummy_6,...,math_perceived_cate,fin_perceived_cate,interest_q,inflation_q,bond_q,mortgage_q,mutual_q,q_all_correct,q_all_incorrect,overconfidence
46,2012010047,2012,1.679737,5,0,0,0,0,1,0,...,7,6,1,1,1,1,1,True,False,0.0
81,2012010082,2012,2.234989,31,0,0,0,0,0,0,...,7,6,1,1,1,1,1,True,False,0.0
84,2012010085,2012,1.178272,14,0,0,0,0,0,0,...,7,6,1,1,1,1,1,True,False,0.0
99,2012010100,2012,1.697046,50,0,0,0,0,0,0,...,7,6,1,1,1,1,1,True,False,0.0
113,2012010114,2012,1.086083,47,0,0,0,0,0,0,...,7,7,1,1,1,1,1,True,False,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80108,2018037036,2018,0.385926,51,0,0,0,0,0,0,...,6,6,1,1,1,1,1,True,False,0.0
80113,2018037041,2018,0.469334,51,0,0,0,0,0,0,...,7,6,1,1,1,1,1,True,False,0.0
80121,2018037049,2018,0.404002,51,0,0,0,0,0,0,...,6,6,1,1,1,1,1,True,False,0.0
80132,2018037060,2018,0.485971,51,0,0,0,0,0,0,...,7,6,1,1,1,1,1,True,False,0.0


In [23]:
# Construct a dataframe for out-of-sample prediction
pr_df = df.loc[~df['overconfidence'].notnull(), :]
pr_df

Unnamed: 0,ID,year,weights,state_cate,state_dummy_1,state_dummy_2,state_dummy_3,state_dummy_4,state_dummy_5,state_dummy_6,...,math_perceived_cate,fin_perceived_cate,interest_q,inflation_q,bond_q,mortgage_q,mutual_q,q_all_correct,q_all_incorrect,overconfidence
0,2012010001,2012,0.363417,24,0,0,0,0,0,0,...,7,6,1,1,3,1,1,False,False,
1,2012010002,2012,1.173593,10,0,0,0,0,0,0,...,6,5,1,2,2,2,2,False,False,
2,2012010003,2012,1.577671,23,0,0,0,0,0,0,...,7,5,1,1,2,1,1,False,False,
3,2012010004,2012,1.577671,14,0,0,0,0,0,0,...,4,4,3,2,2,2,2,False,True,
4,2012010005,2012,2.167569,44,0,0,0,0,0,0,...,6,6,1,3,2,1,3,False,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80159,2018037087,2018,0.404002,51,0,0,0,0,0,0,...,4,5,1,3,3,1,1,False,False,
80160,2018037088,2018,0.475130,20,0,0,0,0,0,0,...,2,4,1,3,2,2,2,False,False,
80161,2018037089,2018,0.531368,20,0,0,0,0,0,0,...,5,5,1,1,3,2,2,False,False,
80162,2018037090,2018,0.377318,20,0,0,0,0,0,0,...,7,4,1,3,3,1,1,False,False,


### Logistic Regression

In [16]:
# Construct dependent and independent variable
state_dummy_list = ['state_dummy_{}'.format(i) for i in range(1, 52)]
X_list = state_dummy_list + ['age', 'female_dummy', 'nonwhite_dummy', 'marital_dummy', 'high_school_dummy', 
                             'college_dummy', 'graduate_dummy', 'income', 'math_perceived_cate', 
                             'fin_perceived_cate', 'interest_q', 'inflation_q', 'bond_q', 'mortgage_q', 'mutual_q']
X = ml_df[X_list]
y = ml_df['overconfidence']

In [20]:
# Specify parameter distributions as suggested
param_dist_1 = {'penalty': ['l1', 'l2'],
                'C': sp_uniform(0.1, 10.0)}

overconfidence_clf_1 = LogisticRegression(random_state=25)

# Run randomized hyperparameter search
random_search_1 = RandomizedSearchCV(overconfidence_clf_1, param_distributions=param_dist_1,
                                     n_iter=5000, n_jobs=-1, cv=5, random_state=25,
                                     scoring='neg_mean_squared_error')

random_search_1.fit(X, y, sample_weight=ml_df['weights'])
print('The optimal tuning parameter values from randomized hyperparameter search are\n',
      random_search_1.best_params_)
print('The MSE of the optimal results is', -random_search_1.best_score_)

The optimal tuning parameter values from randomized hyperparameter search are
 {'C': 0.2026530337554643, 'penalty': 'l2'}
The MSE of the optimal results is 0.05320213809533343


In [22]:
#
overconfidence_clf_1o = LogisticRegression(penalty=random_search_1.best_params_['penalty'], 
                                           C=random_search_1.best_params_['C'],
                                           random_state=25)
overconfidence_clf_1o.fit(X, y, sample_weight=ml_df['weights'])

LogisticRegression(C=0.2026530337554643, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=25, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Random Forest

In [26]:
# Specify parameter distributions as suggested
param_dist_2 = {'n_estimators': sp_randint(10, 200),
                'max_depth': sp_randint(2, 4),
                'min_samples_split': sp_randint(2, 20),
                'min_samples_leaf': sp_randint(2, 20),
                'max_features': sp_randint(1, 4)}

overconfidence_clf_2 = RandomForestClassifier(bootstrap=True, oob_score=True, random_state=25)

# Run randomized hyperparameter search
random_search_2 = RandomizedSearchCV(overconfidence_clf_2, param_distributions=param_dist_2,
                                     n_iter=5000, n_jobs=-1, cv=5, random_state=25,
                                     scoring='neg_mean_squared_error')

random_search_2.fit(X, y)
print('The optimal tuning parameter values from randomized hyperparameter search are\n',
      random_search_2.best_params_)
print('The MSE of the optimal results is', -random_search_2.best_score_)

KeyboardInterrupt: 

### Support Vector Machine

In [None]:
# Specify parameter distributions as suggested
param_dist_3 = {'C': sp_uniform(loc=0.1, scale=10.0),
                'gamma': ['scale', 'auto'],
                'shrinking': [True, False]}

overconfidence_clf_3 = SVC(kernel='rbf', random_state=25)

# Run randomized hyperparameter search
random_search_3 = RandomizedSearchCV(overconfidence_clf_3, param_distributions=param_dist_3,
                                     n_iter=5000, n_jobs=-1, cv=5, random_state=25,
                                     scoring='neg_mean_squared_error')

random_search_3.fit(X, y)
print('The optimal tuning parameter values from randomized hyperparameter search are\n',
      random_search_3.best_params_)
print('The MSE of the optimal results is', -random_search_3.best_score_)

### Ridge Regression

In [None]:
# Specify parameter distributions as suggested
param_dist_4 = {'alpha': sp_uniform(0.1, 10.0)}

overconfidence_clf_4 = Ridge(random_state=25)

# Run randomized hyperparameter search
random_search_4 = RandomizedSearchCV(overconfidence_clf_4, param_distributions=param_dist_4,
                                     n_iter=5000, n_jobs=-1, cv=5, random_state=25,
                                     scoring='neg_mean_squared_error')

random_search_4.fit(X, y)
print('The optimal tuning parameter values from randomized hyperparameter search are\n',
      random_search_4.best_params_)
print('The MSE of the optimal results is', -random_search_4.best_score_)

### Lasso Regression

In [None]:
# Specify parameter distributions as suggested
param_dist_5 = {'alpha': sp_uniform(0.1, 10.0)}

overconfidence_clf_5 = Lasso(random_state=25)

# Run randomized hyperparameter search
random_search_5 = RandomizedSearchCV(overconfidence_clf_5, param_distributions=param_dist_5,
                                     n_iter=5000, n_jobs=-1, cv=5, random_state=25,
                                     scoring='neg_mean_squared_error')

random_search_5.fit(X, y)
print('The optimal tuning parameter values from randomized hyperparameter search are\n',
      random_search_5.best_params_)
print('The MSE of the optimal results is', -random_search_5.best_score_)