In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score

RANDOM_SEED = 6

In [73]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline
train_features = pd.read_csv("training_set_features.csv")

In [4]:
test_df = pd.read_csv("test_set_features.csv")

train_label = pd.read_csv("training_set_labels.csv")

In [74]:
train_features.drop('respondent_id', axis=1, inplace=True)
train_features.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [75]:
numeric_cols = train_features.columns[train_features.dtypes != "object"].values


In [76]:
from sklearn.impute import KNNImputer

In [77]:
knn = KNNImputer(n_neighbors=5,missing_values=np.nan)

In [78]:
knn.fit(train_features[numeric_cols]) 

In [79]:
train_features[numeric_cols] = knn.transform(train_features[numeric_cols])


In [80]:
train_features.isna().sum()

xyz_concern                        0
xyz_knowledge                      0
behavioral_antiviral_meds          0
behavioral_avoidance               0
behavioral_face_mask               0
behavioral_wash_hands              0
behavioral_large_gatherings        0
behavioral_outside_home            0
behavioral_touch_face              0
doctor_recc_xyz                    0
doctor_recc_seasonal               0
chronic_med_condition              0
child_under_6_months               0
health_worker                      0
health_insurance                   0
opinion_xyz_vacc_effective         0
opinion_xyz_risk                   0
opinion_xyz_sick_from_vacc         0
opinion_seas_vacc_effective        0
opinion_seas_risk                  0
opinion_seas_sick_from_vacc        0
age_group                          0
education                       1407
race                               0
sex                                0
income_poverty                  4423
marital_status                  1408
r

In [12]:
train_features.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [81]:
mapping_dictionary_value={'Female':0,'Male':1}
train_features['sex_new']=train_features.sex.map(mapping_dictionary_value)
train_features

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,sex_new
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,,0
1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,...,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe,1
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo,1
3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,,0
4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,...,Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0,,,0
26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0,fcxhlnwr,cmhcxjea,1
26704,2.0,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",0.0,0.0,,,0
26705,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Married,Rent,Employed,lrircsnp,Non-MSA,1.0,0.0,fcxhlnwr,haliazsg,0


In [82]:
train_features.drop('sex',axis=1,inplace=True)

In [15]:
mapping_dictionary_value={'Female':0,'Male':1}
test_df['sex_new']=test_df.sex.map(mapping_dictionary_value)

In [16]:
test_df.drop('sex',axis=1,inplace=True)

In [73]:
train_features.marital_status.unique()

array(['Not Married', 'Married', 'Missing'], dtype=object)

In [234]:
test_df.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,sex_new
0,0.420051,1.196093,-0.227075,0.618912,-0.272576,0.45994,1.338524,-0.714493,0.691919,-0.544766,...,-1.986034,6863,-1.145032,-2.28997,0.760161,-0.938288,1.265999,0.152265,-0.57774,-0.827124
1,-0.679394,-0.423542,-0.227075,-1.626549,-0.272576,-2.176822,-0.748583,-0.714493,-1.450582,-0.544766,...,0.518443,2573,-1.145032,-2.28997,0.760161,-0.065447,-1.150402,2.816934,-0.57774,1.209008
2,0.420051,1.196093,-0.227075,-1.626549,3.673487,0.45994,1.338524,1.402714,0.691919,-0.544766,...,0.518443,6863,0.902827,-2.289604,0.760161,-1.143737,-1.150402,0.152265,-0.57774,1.209008
3,-0.679394,-0.423542,-0.227075,-1.626549,-0.272576,-2.176822,-0.748583,-0.714493,-1.450582,1.950101,...,0.518443,12775,0.902827,-2.289604,-0.675087,-1.143737,1.265999,0.152265,-0.57774,-0.827124
4,1.519495,-0.423542,4.415775,0.618912,-0.272576,0.45994,1.338524,1.402714,0.691919,-0.544766,...,-1.925843,12775,-1.145032,-2.289604,0.760161,1.843573,-1.150402,-1.180069,0.503604,-0.827124


In [17]:
train_features.isna().sum()

xyz_concern                        0
xyz_knowledge                      0
behavioral_antiviral_meds          0
behavioral_avoidance               0
behavioral_face_mask               0
behavioral_wash_hands              0
behavioral_large_gatherings        0
behavioral_outside_home            0
behavioral_touch_face              0
doctor_recc_xyz                    0
doctor_recc_seasonal               0
chronic_med_condition              0
child_under_6_months               0
health_worker                      0
health_insurance                   0
opinion_xyz_vacc_effective         0
opinion_xyz_risk                   0
opinion_xyz_sick_from_vacc         0
opinion_seas_vacc_effective        0
opinion_seas_risk                  0
opinion_seas_sick_from_vacc        0
age_group                          0
education                       1407
race                               0
income_poverty                  4423
marital_status                  1408
rent_or_own                     2042
e

In [246]:
train_features.shape

(26707, 35)

In [83]:
train_features.employment_status.mode()

0    Employed
Name: employment_status, dtype: object

In [84]:
train_features['employment_status'].fillna('Employed', inplace=True)

In [85]:
train_features.income_poverty.mode()

0    <= $75,000, Above Poverty
Name: income_poverty, dtype: object

In [86]:
train_features['marital_status'].fillna('Married', inplace=True)

In [87]:
train_features.education.mode()

0    College Graduate
Name: education, dtype: object

In [88]:
train_features['education'].fillna('College Graduate', inplace=True)

In [24]:
test_df.employment_status.mode()

0    Employed
Name: employment_status, dtype: object

In [25]:
test_df['employment_status'].fillna('Employed', inplace=True)

In [26]:
test_df.marital_status.mode()

0    Married
Name: marital_status, dtype: object

In [27]:
test_df['marital_status'].fillna('Married', inplace=True)

In [28]:
test_df.education.mode()

0    College Graduate
Name: education, dtype: object

In [29]:
test_df['education'].fillna('College Graduate', inplace=True)

In [30]:
test_df.isna().sum()

respondent_id                      0
xyz_concern                       85
xyz_knowledge                    122
behavioral_antiviral_meds         79
behavioral_avoidance             213
behavioral_face_mask              19
behavioral_wash_hands             40
behavioral_large_gatherings       72
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_xyz                 2160
doctor_recc_seasonal            2160
chronic_med_condition            932
child_under_6_months             813
health_worker                    789
health_insurance               12228
opinion_xyz_vacc_effective       398
opinion_xyz_risk                 380
opinion_xyz_sick_from_vacc       375
opinion_seas_vacc_effective      452
opinion_seas_risk                499
opinion_seas_sick_from_vacc      521
age_group                          0
education                          0
race                               0
income_poverty                  4497
marital_status                     0
r

In [31]:
test_df[numeric_cols] = knn.transform(test_df[numeric_cols])


In [263]:
train_features.rent_or_own.unique()

array(['Own', 'Rent', nan], dtype=object)

In [136]:
train_features['rent_or_own'].fillna('Missing',inplace=True)

In [33]:
test_df['rent_or_own'].fillna('Missing',inplace=True)

In [34]:
train_features.rent_or_own.value_counts()

Own        18736
Rent        5929
Missing     2042
Name: rent_or_own, dtype: int64

In [173]:
categorical_cols = train_features.columns[train_features.dtypes == "object"].values


In [174]:
categorical_cols


array(['income_poverty'], dtype=object)

In [177]:
df_map = test_df['income_poverty'].value_counts().to_dict()
df_map

{'<= $75,000, Above Poverty': 12775,
 '> $75,000': 6863,
 'Missing': 4497,
 'Below Poverty': 2573}

In [178]:
test_df['income_poverty']= test_df.income_poverty.map(df_map)

In [273]:
test_df.head()

Unnamed: 0,respondent_id,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation,sex_new
0,26707,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,Not Married,Rent,Employed,2183,"MSA, Not Principle City",1.0,0.0,atmlpfrs,hfxkjkmi,0
1,26708,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Not Married,Rent,Employed,2867,Non-MSA,3.0,0.0,atmlpfrs,xqwwgdyp,1
2,26709,2.0,2.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,Married,Own,Employed,2022,Non-MSA,1.0,0.0,nduyfdeo,pvmttkik,1
3,26710,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Married,Own,Not in Labor Force,2022,"MSA, Not Principle City",1.0,0.0,,,0
4,26711,3.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,Not Married,Own,Employed,4363,Non-MSA,0.0,1.0,fcxhlnwr,mxkfnird,0


In [55]:
train_features.isna().sum()

xyz_concern                        0
xyz_knowledge                      0
behavioral_antiviral_meds          0
behavioral_avoidance               0
behavioral_face_mask               0
behavioral_wash_hands              0
behavioral_large_gatherings        0
behavioral_outside_home            0
behavioral_touch_face              0
doctor_recc_xyz                    0
doctor_recc_seasonal               0
chronic_med_condition              0
child_under_6_months               0
health_worker                      0
health_insurance                   0
opinion_xyz_vacc_effective         0
opinion_xyz_risk                   0
opinion_xyz_sick_from_vacc         0
opinion_seas_vacc_effective        0
opinion_seas_risk                  0
opinion_seas_sick_from_vacc        0
age_group                          0
education                          0
race                               0
income_poverty                  4423
marital_status                     0
rent_or_own                        0
e

In [115]:
numeric_cols = train_features.columns[test_df.dtypes != "object"].values
numeric_cols

array(['xyz_concern', 'xyz_knowledge', 'behavioral_antiviral_meds',
       'behavioral_avoidance', 'behavioral_face_mask',
       'behavioral_wash_hands', 'behavioral_large_gatherings',
       'behavioral_outside_home', 'behavioral_touch_face',
       'doctor_recc_xyz', 'doctor_recc_seasonal', 'chronic_med_condition',
       'child_under_6_months', 'health_worker', 'health_insurance',
       'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'marital_status', 'rent_or_own',
       'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'sex_new'], dtype=object)

In [110]:
scaling = StandardScaler()

In [113]:
test_df.drop('respondent_id', inplace=True,axis=1)

In [179]:
scaling.fit_transform(train_features[numeric_cols])

array([[-0.67939401, -2.04317637, -0.22707496, ..., -1.18006913,
        -0.57773956, -0.82712405],
       [ 1.51949541,  1.19609295, -0.22707496, ..., -1.18006913,
        -0.57773956,  1.20900849],
       [-0.67939401, -0.42354171, -0.22707496, ...,  1.48459988,
        -0.57773956,  1.20900849],
       ...,
       [ 0.4200507 ,  1.19609295, -0.22707496, ..., -1.18006913,
        -0.57773956, -0.82712405],
       [-0.67939401, -0.42354171, -0.22707496, ...,  0.15226538,
        -0.57773956, -0.82712405],
       [-1.77883873, -2.04317637, -0.22707496, ...,  0.15226538,
        -0.57773956,  1.20900849]])

In [180]:
train_features[numeric_cols] = scaling.transform(train_features[numeric_cols])
test_df[numeric_cols] = scaling.transform(test_df[numeric_cols])

In [181]:
train_features.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,race,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,sex_new
0,-0.679394,-2.043176,-0.227075,-1.626549,-0.272576,-2.176822,-0.748583,1.402714,0.691919,-0.544766,...,0.508348,2697,-1.128759,0.645342,-0.635275,-0.075655,-1.042496,-1.180069,-0.57774,-0.827124
1,1.519495,1.196093,-0.227075,0.618912,-0.272576,0.45994,-0.748583,1.402714,0.691919,-0.544766,...,0.508348,2697,-1.128759,-1.360902,0.737248,-0.092245,1.128798,-1.180069,-0.57774,1.209008
2,-0.679394,-0.423542,-0.227075,0.618912,-0.272576,-2.176822,-0.748583,-0.714493,-1.450582,-0.544766,...,0.508348,12777,-1.128759,0.645342,0.737248,0.234433,1.128798,1.4846,-0.57774,1.209008
3,-0.679394,-0.423542,-0.227075,0.618912,-0.272576,0.45994,1.338524,-0.714493,-1.450582,-0.544766,...,0.508348,2697,-1.128759,-1.360902,-0.635275,-1.072276,-0.717315,-1.180069,-0.57774,-0.827124
4,0.420051,-0.423542,-0.227075,0.618912,-0.272576,0.45994,1.338524,-0.714493,0.691919,-0.544766,...,0.508348,12777,0.885929,0.645342,0.737248,0.234433,1.128798,0.152265,-0.57774,-0.827124


In [146]:
categorical_cols = train_features.columns[train_features.dtypes == "object"].values
categorical_cols

array(['income_poverty', 'employment_industry', 'employment_occupation'],
      dtype=object)

In [161]:
train_features.head()

Unnamed: 0,xyz_concern,xyz_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,doctor_recc_xyz,...,employment_occupation_rcertsgn,employment_occupation_tfqavkke,employment_occupation_ukymxvdu,employment_occupation_uqqtjvyb,employment_occupation_vlluhbov,employment_occupation_xgwztkwe,employment_occupation_xqwwgdyp,employment_occupation_xtkaffoo,employment_occupation_xzmlyyjv,employment_occupation_nan
0,-0.679394,-2.043176,-0.227075,-1.626549,-0.272576,-2.176822,-0.748583,1.402714,0.691919,-0.544766,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.519495,1.196093,-0.227075,0.618912,-0.272576,0.45994,-0.748583,1.402714,0.691919,-0.544766,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-0.679394,-0.423542,-0.227075,0.618912,-0.272576,-2.176822,-0.748583,-0.714493,-1.450582,-0.544766,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.679394,-0.423542,-0.227075,0.618912,-0.272576,0.45994,1.338524,-0.714493,-1.450582,-0.544766,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.420051,-0.423542,-0.227075,0.618912,-0.272576,0.45994,1.338524,-0.714493,0.691919,-0.544766,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [236]:
train_features.drop('health_insurance', inplace=True, axis=1)

In [237]:
test_df.drop('health_insurance', inplace=True, axis=1)

In [239]:
train_features.shape

(26707, 32)

In [209]:
train_features.shape

(26707, 33)

array([[0.89543747, 0.10456253],
       [0.91648419, 0.08351581],
       [0.56795998, 0.43204002],
       ...,
       [0.72485048, 0.27514952],
       [0.96669475, 0.03330525],
       [0.37887594, 0.62112406]])

In [242]:
model = LogisticRegression(solver='liblinear',C=1, penalty= 'l2')

In [243]:
model.fit(train_features, train_xyz)

In [244]:
train_xyz = train_label.xyz_vaccine
train_seasonal = train_label.seasonal_vaccine

In [245]:
prediction_xyz = model.predict_proba(test_df)

In [246]:
model.fit(train_features, train_seasonal)

In [247]:
prediction_seasonal = model.predict_proba(test_df)

In [248]:
y_preds = pd.DataFrame(
    {
        "xyz_vaccine": prediction_xyz[:,1],
        "seasonal_vaccine": prediction_seasonal[:,1],
    })
    
print("y_preds.shape:", y_preds.shape)
y_preds = y_preds[1:]
y_preds

y_preds.shape: (26707, 2)


Unnamed: 0,xyz_vaccine,seasonal_vaccine
1,0.124951,0.078869
2,0.410309,0.614514
3,0.322301,0.822006
4,0.092135,0.328922
5,0.493502,0.882455
...,...,...
26702,0.034359,0.132787
26703,0.462094,0.389243
26704,0.185832,0.449000
26705,0.253034,0.273525


In [168]:
train_features.drop(['employment_industry','employment_occupation'], inplace=True, axis=1)

In [169]:
test_df.drop(['employment_industry','employment_occupation'],inplace=True, axis=1)

In [170]:
train_features.income_poverty.unique()

array(['Below Poverty', '<= $75,000, Above Poverty', '> $75,000', nan],
      dtype=object)

In [172]:
train_features['income_poverty'].fillna('Missing',inplace=True)

In [224]:
train_features.drop([''], inplace=True, axis=1)

In [240]:
test_df.drop(test_df.tail(1).index,inplace=True)

In [252]:
model.score(test_df,train_xyz)

0.5664806979443592

In [251]:
model.score(test_df,train_seasonal)

0.506608754259183

In [253]:
from sklearn.feature_selection import SelectKBest,f_classif

In [297]:
selector = SelectKBest(score_func = f_classif,k=20)
X_selected = selector.fit_transform(train_features,train_xyz)
select_indices = selector.get_support(indices=True)
selected_features = train_features.columns[select_indices]
print('Selected Features : ')

for i,feature in enumerate(selected_features):
    print(f"{i+1}.{feature}")

Selected Features : 
1.xyz_concern
2.xyz_knowledge
3.behavioral_antiviral_meds
4.behavioral_avoidance
5.behavioral_face_mask
6.behavioral_wash_hands
7.behavioral_touch_face
8.doctor_recc_xyz
9.doctor_recc_seasonal
10.chronic_med_condition
11.child_under_6_months
12.health_worker
13.opinion_xyz_vacc_effective
14.opinion_xyz_risk
15.opinion_xyz_sick_from_vacc
16.opinion_seas_vacc_effective
17.opinion_seas_risk
18.education
19.marital_status
20.rent_or_own


In [298]:
new_train_df = train_features[selected_features]

In [299]:
new_test_df = test_df[selected_features]


In [300]:
model = LogisticRegression(solver='liblinear',C=1, penalty= 'l2')

In [301]:
model.fit(new_train_df, train_xyz)

In [302]:
prediction_xyz = model.predict_proba(new_test_df)

In [303]:
model.score(new_test_df,train_xyz)

0.7206350395027521

In [308]:
model.fit(new_train_df, train_seasonal)

In [309]:
prediction_seasonal = model.predict_proba(new_test_df)

In [310]:
model.score(new_test_df,train_seasonal)

0.5064589807915528

In [312]:
y_preds = pd.DataFrame(
    {
        "xyz_vaccine": prediction_xyz[:,1],
        "seasonal_vaccine": prediction_seasonal[:,1],
    })
    
print("y_preds.shape:", y_preds.shape)
y_preds = y_preds[1:]
y_preds

y_preds.shape: (26707, 2)


Unnamed: 0,xyz_vaccine,seasonal_vaccine
1,0.052978,0.103976
2,0.430925,0.624331
3,0.418867,0.695282
4,0.232230,0.386599
5,0.424711,0.836144
...,...,...
26702,0.064438,0.099395
26703,0.330977,0.469851
26704,0.113060,0.492927
26705,0.150116,0.235855


In [313]:
y_preds.to_csv('final_output.csv')

In [326]:
y_preds.shape

(26706, 2)