First, we have to import the training data sets:

training_set_labels and training_set_features

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score, classification_report


# Notebook display configuration.
# (`%matplotlib inline` is intentionally left disabled.)
pd.set_option('display.max_columns', None)  # show every column of wide frames
plt.rcParams["figure.figsize"] = (18, 8)

# Load the training features and the two target labels.
t_features = pd.read_csv('dataset and all/training_set_features.csv')
t_labels = pd.read_csv('dataset and all/training_set_labels.csv')

# X holds the features; y1 / y2 are the two vaccine targets.
# NOTE(review): features and labels are paired by row position here, which
# assumes both CSVs list respondents in the same order -- TODO confirm.
X = t_features
y1, y2 = t_labels['xyz_vaccine'], t_labels['seasonal_vaccine']

# One call splits the features and both targets with the same row partition
# (80% train / 20% hold-out, fixed seed for reproducibility).
(
    X_train, X_test,
    y1_train, y1_test,
    y2_train, y2_test,
) = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

# Sanity check: shape of the training split, then of the hold-out split.
print(X_train.shape, X_test.shape, sep="\n")



(21365, 36)
(5342, 36)


Now, let's get started on cleaning up the data

First, we handle the categorical data by encoding each category as an integer, which makes it easier to work with

In [9]:
# List the distinct values present in every hold-out column
# (a quick way to spot categorical columns and missing values).
for name, values in X_test.items():
    print(name, values.unique())

respondent_id [15772  9407 16515 ...  5045  6340 12943]
xyz_concern [ 2.  3.  1.  0. nan]
xyz_knowledge [ 1.  0.  2. nan]
behavioral_antiviral_meds [ 0.  1. nan]
behavioral_avoidance [ 1. nan  0.]
behavioral_face_mask [ 1.  0. nan]
behavioral_wash_hands [ 1.  0. nan]
behavioral_large_gatherings [ 1.  0. nan]
behavioral_outside_home [ 1.  0. nan]
behavioral_touch_face [ 1.  0. nan]
doctor_recc_xyz [ 0. nan  1.]
doctor_recc_seasonal [ 0. nan  1.]
chronic_med_condition [nan  0.  1.]
child_under_6_months [nan  0.  1.]
health_worker [nan  0.  1.]
health_insurance [nan  1.  0.]
opinion_xyz_vacc_effective [nan  4.  5.  3.  2.  1.]
opinion_xyz_risk [nan  2.  4.  1.  3.  5.]
opinion_xyz_sick_from_vacc [nan  2.  4.  1.  5.  3.]
opinion_seas_vacc_effective [nan  4.  5.  2.  1.  3.]
opinion_seas_risk [nan  4.  2.  1.  5.  3.]
opinion_seas_sick_from_vacc [nan  1.  2.  4.  5.  3.]
age_group ['18 - 34 Years' '35 - 44 Years' '45 - 54 Years' '55 - 64 Years'
 '65+ Years']
education [nan 'College Graduat

In [46]:
# ---------------------------------------------------------------------------
# Integer encodings for the categorical survey columns.
#
# Columns with a natural order (age group, education, income) use codes that
# follow that order; the remaining columns simply get distinct integer codes.
# ---------------------------------------------------------------------------
mapping_age_group = {"18 - 34 Years": 1, "35 - 44 Years": 2, "45 - 54 Years": 3, "55 - 64 Years": 4, "65+ Years": 5}
mapping_race = {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4}
mapping_sex = {"Male": 1, "Female": 2}
mapping_income_poverty = {"Below Poverty": 1, "<= $75,000, Above Poverty": 2, "> $75,000": 3}
mapping_marital_status = {"Married": 1, "Not Married": 2}
mapping_rent_or_own = {"Own": 1, "Rent": 2}
mapping_employment_status = {"Employed": 1, "Not in Labor Force": 2, "Unemployed": 3}
mapping_census_msa = {"Non-MSA": 1, "MSA, Not Principle City": 2, "MSA, Principle City": 3}

# Household counts keep their numeric value; a missing answer becomes -1.
mapping_household_adults = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, np.nan: -1}
mapping_household_children = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, np.nan: -1}

mapping_geo = {
    'oxchjgsf': 1,
    'bhuqouqj': 2,
    'qufhixun': 3,
    'lrircsnp': 4,
    'atmpeygn': 5,
    'lzgpxyit': 6,
    'fpwskwrf': 7,
    'mlyzmhmf': 8,
    'dqpwygqj': 9,
    'kbazzjca': 10
}

# Missing industry / occupation values are NOT given a code here (the
# `np.nan: -1` entry is deliberately disabled), so they remain NaN after
# mapping.
mapping_employment_industry = {
    # np.nan: -1,
    'pxcmvdjn': 1,
    'rucpziij': 2,
    'wxleyezf': 3,
    'saaquncn': 4,
    'xicduogh': 5,
    'ldnlellj': 6,
    'wlfvacwt': 7,
    'nduyfdeo': 8,
    'fcxhlnwr': 9,
    'vjjrobsf': 10,
    'arjwrbjb': 11,
    'atmlpfrs': 12,
    'msuufmds': 13,
    'xqicxuve': 14,
    'phxvnwax': 15,
    'dotnnunm': 16,
    'mfikgejo': 17,
    'cfqqtusy': 18,
    'mcubkhph': 19,
    'haxffmxo': 20,
    'qnlwzans': 21
}
mapping_employment_occupation = {
    # np.nan: -1,
    'xgwztkwe': 1,
    'xtkaffoo': 2,
    'emcorrxb': 3,
    'vlluhbov': 4,
    'xqwwgdyp': 5,
    'ccgxvspp': 6,
    'qxajmpny': 7,
    'kldqjyjy': 8,
    'mxkfnird': 9,
    'hfxkjkmi': 10,
    'bxpfxfdn': 11,
    'ukymxvdu': 12,
    'cmhcxjea': 13,
    'haliazsg': 14,
    'dlvbwzss': 15,
    'xzmlyyjv': 16,
    'oijqvulv': 17,
    'rcertsgn': 18,
    'tfqavkke': 19,
    'hodpvpew': 20,
    'uqqtjvyb': 21,
    'pvmttkik': 22,
    'dcjcmpih': 23
}

# Codes follow the order of attainment: 'Some College' ranks below
# 'College Graduate' (previously the two codes were swapped).
mapping_education = {
    '< 12 Years': 1,
    '12 Years': 2,
    'Some College': 3,
    'College Graduate': 4
}

# Single source of truth: which mapping encodes which column.
CATEGORICAL_MAPPINGS = {
    "age_group": mapping_age_group,
    "race": mapping_race,
    "sex": mapping_sex,
    "income_poverty": mapping_income_poverty,
    "marital_status": mapping_marital_status,
    "rent_or_own": mapping_rent_or_own,
    "employment_status": mapping_employment_status,
    "census_msa": mapping_census_msa,
    "household_adults": mapping_household_adults,
    "household_children": mapping_household_children,
    "hhs_geo_region": mapping_geo,
    "employment_industry": mapping_employment_industry,
    "employment_occupation": mapping_employment_occupation,
    "education": mapping_education,
}


def encode_categoricals(frame):
    """Return a new frame with every column in CATEGORICAL_MAPPINGS encoded.

    Each listed column is passed through ``Series.map`` with its mapping, so
    any value that is not a key of the mapping becomes NaN. Column order and
    all other columns are left as they are. The input frame is not modified;
    building a new frame with ``assign`` also avoids the
    SettingWithCopyWarning that column assignment on a split slice can raise.
    """
    encoded = {
        column: frame[column].map(mapping)
        for column, mapping in CATEGORICAL_MAPPINGS.items()
    }
    return frame.assign(**encoded)


# Apply the identical encoding to both splits.
# NOTE: run this once per fresh split -- mapping already-encoded integers a
# second time would turn the string-coded columns into NaN.
X_test = encode_categoricals(X_test)
X_train = encode_categoricals(X_train)


# Inspect the result: column list, then the distinct values of each column.
print(X_train.columns)
for col in X_train.columns:
    print(col, X_train[col].unique())


Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
respondent_id [    0     1     2 ... 26704 26705 26706]
xyz_concern [ 1.  3.  2.  0. nan]
xyz_knowledge [ 0.  2.  1. nan]
beh

In [47]:
# Fill every missing value with the most frequent value of its column.
#
# `DataFrame.mode()` lists each column's mode(s) in sorted order, so row 0
# holds the first mode of every column -- the same value the per-column
# `Series.mode()[0]` lookup returns.
#
# The result is assigned back rather than calling
# `X_train[column].fillna(..., inplace=True)`: that form modifies a temporary
# column object, is deprecated in pandas 2.x and has no effect once
# Copy-on-Write is enabled.
train_modes = X_train.mode().iloc[0]  # kept so other splits can reuse the training modes
X_train = X_train.fillna(train_modes)

# Confirm that no missing values remain in any column.
print(X_train.isnull().sum())


respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

Now we can start actually training on the data


Step 3: Standardize the Features (if necessary)

Standardizing can be beneficial for certain models like logistic regression or SVM.


In [48]:

# Learn the per-column scaling parameters from the training features, then
# apply them to those same features.
# NOTE: with scikit-learn's default output settings X_train is a NumPy array
# from here on, so the column labels are no longer attached.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
# The hold-out split is not scaled here; the call is left disabled:
# X_test = scaler.transform(X_test)

Now let's import the testing datasets.
Then, split the data for the 2 target variables separately and standardize the nomenclature