In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
xtrain = pd.read_csv("training_set_features.csv")
ytrain = pd.read_csv("training_set_labels.csv")
testSet = pd.read_csv("test_set_features.csv")

In [44]:
xtrain.drop(columns=["respondent_id"], inplace = True)

In [45]:
print(xtrain.shape, ytrain.shape, testSet.shape)

(26707, 35) (26707, 3) (26708, 36)


# Filter useful data

In [106]:
# cols to remove: health_insurance, employment industry and occupation

# cols to separate: doctor_recc xyz and seasonal, opinion effective/risk/sick xyz and seasonal

def separate(df):
    colsReqdxyz = ['xyz_concern', 'xyz_knowledge',
            'behavioral_antiviral_meds', 'behavioral_avoidance',
            'behavioral_face_mask', 'behavioral_wash_hands',
            'behavioral_large_gatherings', 'behavioral_outside_home',
            'behavioral_touch_face', 'doctor_recc_xyz',
            'chronic_med_condition', 'child_under_6_months', 'health_worker',
            'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
            'opinion_xyz_sick_from_vacc', 'age_group',
            'education', 'race', 'sex', 'income_poverty', 'marital_status',
            'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
            'household_adults', 'household_children']
    
    colsReqdseas = ['xyz_concern', 'xyz_knowledge',
            'behavioral_antiviral_meds', 'behavioral_avoidance',
            'behavioral_face_mask', 'behavioral_wash_hands',
            'behavioral_large_gatherings', 'behavioral_outside_home',
            'behavioral_touch_face', 'doctor_recc_seasonal',
            'chronic_med_condition', 'child_under_6_months', 'health_worker',
            'opinion_seas_vacc_effective','opinion_seas_risk', 'opinion_seas_sick_from_vacc',
            'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
            'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
            'household_adults', 'household_children']
    
    return df[colsReqdseas], df[colsReqdxyz]

xtrainseas, xtrainxyz = separate(xtrain)

ytrainseas, ytrainxyz = ytrain["seasonal_vaccine"], ytrain["xyz_vaccine"]

# Handle missing values

In [107]:
from sklearn.impute import SimpleImputer
def fillMissing(df):
    imp = SimpleImputer(strategy = "most_frequent")
    cols = df.columns
    fill = imp.fit_transform(df)
    return pd.DataFrame(fill, columns = cols, index = df.index)

xtrainxyz = fillMissing(xtrainxyz)
xtrainseas = fillMissing(xtrainseas)

# Encoding

In [108]:
# do one hot encoding for all
from sklearn.preprocessing import OneHotEncoder

def encode(df):
    toEncode = ['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status','hhs_geo_region','census_msa']

    # one hot encoder
    encoder = OneHotEncoder()
    for i in toEncode:
        result = encoder.fit_transform(df[[i]])
        df = df.join(pd.DataFrame(result.toarray(), columns = encoder.categories_))
    
    # drop the pre-encoded cols
    df.drop(columns = toEncode, inplace = True)

    # convert the tuple type column names to string
    df.columns = [x[0] if type(x) == tuple else x for x in df.columns]

    # if dtype is still object, make it int
    for col in df.select_dtypes(include = ['object']).columns:
        df[col] = df[col].astype(int)
    
    return df

xtrainxyz = encode(xtrainxyz)
xtrainseas = encode(xtrainseas)

# Fitting models

In [109]:
from sklearn.model_selection import train_test_split

x = xtrainxyz
y = ytrainxyz
trainX1, testX1, trainY1, testY1 = train_test_split(x, y, test_size = 0.2)
x = xtrainseas
y = ytrainseas
trainX2, testX2, trainY2, testY2 = train_test_split(x, y, test_size = 0.2)

# DECISION TREE

In [110]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()

clf.fit(trainX1, trainY1)
clf.score(testX1, testY1)

0.7461624859603145

In [111]:
clf2 = DecisionTreeClassifier()

clf2.fit(trainX2, trainY2)
clf2.score(testX2, testY2)

0.6860726319730438

# LOGISTIC REGRESSION

In [112]:
from sklearn.linear_model import LogisticRegression

modelxyz = LogisticRegression()
modelxyz.fit(trainX1, trainY1)
modelxyz.score(testX1, testY1)

0.8339573193560464

In [113]:
modelseas = LogisticRegression()
modelseas.fit(trainX2, trainY2)
modelseas.score(testX2, testY2)

0.7742418569824036

# GRADIENT BOOST

In [118]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, max_depth=5)
gbc.fit(trainX1, trainY1)
gbc.score(testX1, testY1)

0.8354548858105578

In [117]:
gbc2 = GradientBoostingClassifier(learning_rate=0.1, n_estimators=200, max_depth=5)
gbc2.fit(trainX2, trainY2)
gbc2.score(testX2, testY2)

0.7798577311868214

# USING GRADIENT BOOST TO GET RESULTS

In [87]:
# filter useful cols
testSeasonal, testXYZ = separate(testSet)

# handle missing values
testSeasonal = fillMissing(testSeasonal)
testXYZ = fillMissing(testXYZ)

# encode text cols
testSeasonal = encode(testSeasonal)
testXYZ = encode(testXYZ)

In [90]:
h1n1 = gbc.predict(testXYZ)
seasonal = gbc2.predict(testSeasonal)

result = pd.DataFrame( {
    "h1n1_vaccine" : h1n1,
    "seasonal_vaccine" : seasonal
} ,index = [i for i in range(26707,53415)])

result

Unnamed: 0,h1n1_vaccine,seasonal_vaccine
26707,0,0
26708,0,0
26709,1,1
26710,1,1
26711,0,0
...,...,...
53410,0,0
53411,0,0
53412,0,0
53413,0,0


In [91]:
result.to_csv("result.csv")