## Pre-Processing Functions ##

In [1]:
def data_cleaning(df, null_threshold=0.4):
    """Drop sparse columns and mode-impute the remaining missing values.

    The frame is modified in place and also returned, so both
    ``data_cleaning(df)`` and ``df = data_cleaning(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. Column labels are assumed to be unique.
    null_threshold : float, default 0.4
        A column is removed when its fraction of null values is strictly
        greater than this value.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after cleaning.
    """
    # Measure sparsity against this frame's own row count (not a fixed
    # dataset size), and decide every drop up front so that removing one
    # column can never cause its neighbour to be skipped.
    null_fraction = df.isnull().mean()
    sparse_columns = null_fraction[null_fraction > null_threshold].index
    df.drop(columns=sparse_columns, inplace=True)

    # Most-frequent imputation for every remaining column that has gaps.
    for column in df.columns:
        values = df[column]
        if not values.isnull().any():
            continue
        modes = values.mode()
        if modes.empty:
            # Nothing but nulls (only reachable with null_threshold >= 1):
            # there is no most-frequent value to fill with.
            continue
        # Assign back explicitly; filling a column slice in place is not
        # guaranteed to propagate to the parent frame.
        df[column] = values.fillna(modes.iloc[0])
    return df

In [2]:
def label_encoding(df):
    """Replace three ordered categorical columns with integer ranks.

    ``age_group``, ``education`` and ``income_poverty`` are mapped to
    ``age_group_numerical``, ``education_numerical`` and
    ``income_poverty_numerical`` (appended as the last columns), and the
    original three columns are removed from the result.

    The input frame is left untouched; a new frame is returned. Values that
    are missing or not listed in the mappings below become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``age_group``, ``education`` and
        ``income_poverty``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the three columns replaced by their ranks.
    """
    age_map = {
        '18 - 34 Years': 1,
        '35 - 44 Years': 2,
        '45 - 54 Years': 3,
        '55 - 64 Years': 4,
        '65+ Years': 5,
    }

    education_map = {
        '< 12 Years': 1,
        '12 Years': 2,
        'Some College': 3,
        'College Graduate': 4,
    }

    income_map = {
        'Below Poverty': 1,
        '<= $75,000, Above Poverty': 2,
        '> $75,000': 3,
    }

    # assign() builds a new frame, so the caller's DataFrame does not gain
    # the helper columns as a hidden side effect.
    encoded = df.assign(
        age_group_numerical=df['age_group'].map(age_map),
        education_numerical=df['education'].map(education_map),
        income_poverty_numerical=df['income_poverty'].map(income_map),
    )
    return encoded.drop(columns=['age_group', 'education', 'income_poverty'])

In [3]:
def data_preprocessing(df):
    """Run the full feature-encoding pipeline on one feature frame.

    First applies label_encoding to df, then passes the result through
    pandas.get_dummies (with drop_first=True) and returns the encoded frame.
    """
    ordinal_encoded = label_encoding(df)
    return pd.get_dummies(ordinal_encoded, prefix_sep='_', drop_first=True)

### Importing and Preprocessing the Data ###

In [4]:
import pandas as pd

# Feature tables for the training and test sets.
X_train = pd.read_csv("training_set_features.csv")
X_test = pd.read_csv("test_set_features.csv")

# Training labels; the 'h1n1_vaccine' and 'seasonal_vaccine' columns are the
# targets fitted further down.
y_train = pd.read_csv("training_set_labels.csv")

# Submission template; its prediction columns are overwritten further down and
# 'respondent_id' becomes the index of the final output.
submission_format = pd.read_csv("submission_format.csv")

In [5]:
preprocessed_X_train = data_preprocessing(X_train)

# Each frame is one-hot encoded on its own, so a category that appears in only
# one of them would leave the two matrices with different columns and make
# model.predict fail or misalign features. Re-align the test matrix to the
# training columns: dummies missing from the test set are filled with 0 and
# columns never seen in training are dropped. When the columns already match
# this changes nothing.
preprocessed_X_test = data_preprocessing(X_test).reindex(
    columns=preprocessed_X_train.columns, fill_value=0
)

In [6]:
### XGBoost Regression ###

In [16]:
import xgboost as xgb
from xgboost import XGBRegressor
# Shallow boosted trees (depth 3) with a small step size (eta = 0.01).
# NOTE(review): the targets fitted with this estimator are named *_vaccine and
# look like 0/1 labels; a squared-error regressor is not constrained to [0, 1].
# Consider a classifier / logistic objective — confirm with the task definition.
model = XGBRegressor(max_depth=3, eta=0.01)

In [17]:
# Fit the shared estimator on the 'h1n1_vaccine' target.
# NOTE(review): features and labels are paired purely by row position —
# assumes both CSVs list respondents in the same order; confirm.
model.fit(preprocessed_X_train, y_train[['h1n1_vaccine']])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.00999999978, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [18]:
# Predict with the estimator as currently fitted. This cell must run after the
# 'h1n1_vaccine' fit and before `model` is refitted on the seasonal target,
# otherwise h1n1_pred silently holds predictions for the wrong target.
h1n1_pred = model.predict(preprocessed_X_test)

In [19]:
# Write the H1N1 predictions into the submission frame.
# NOTE(review): presumably h1n1_pred is a plain array, in which case it is
# matched to rows by position — assumes the test features and the submission
# template share the same row order; confirm.
submission_format["h1n1_vaccine"] = h1n1_pred

In [20]:
# Refit the same `model` object on the 'seasonal_vaccine' target. From here on
# `model` reflects the seasonal fit, so the H1N1 predictions must already have
# been taken. NOTE(review): presumably fit() retrains from scratch rather than
# continuing from the previous fit — confirm against the xgboost version used.
model.fit(preprocessed_X_train, y_train[['seasonal_vaccine']])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, eta=0.01, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.00999999978, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [21]:
# Predict with the estimator as currently fitted; only meaningful when the most
# recent fit was on 'seasonal_vaccine'.
seas_pred = model.predict(preprocessed_X_test)

In [22]:
# Write the seasonal predictions into the submission frame.
# NOTE(review): presumably matched to rows by position, like the H1N1 column —
# assumes the test features and the template share the same row order; confirm.
submission_format["seasonal_vaccine"] = seas_pred

### Final Submission ###

In [23]:
# Index the submission by respondent so the id is written as the first CSV
# column on export, then display the frame for a visual check.
final_submission = submission_format.set_index(keys='respondent_id')
final_submission

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0.346285,0.384336
26708,0.213432,0.270644
26709,0.282761,0.632378
26710,0.510047,0.669115
26711,0.398622,0.445957
...,...,...
53410,0.382347,0.520957
53411,0.245475,0.403277
53412,0.245773,0.413633
53413,0.213948,0.458274


In [24]:
# Persist the predictions; the respondent_id index is written as the first column.
final_submission.to_csv(path_or_buf='Final_submission.csv', index=True)