In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [93]:
xtrain = pd.read_csv("training_set_features.csv")
ytrain = pd.read_csv("training_set_labels.csv")
testSet = pd.read_csv("test_set_features.csv")

In [96]:
print(xtrain.shape, ytrain.shape, testSet.shape)

(26707, 36) (26707, 3) (26708, 36)


# Filter useful data

In [43]:
# cols to remove: health_insurance, employment industry and occupation

# cols to separate: doctor_recc xyz and seasonal, opinion effective/risk/sick xyz and seasonal

def separate(df):
    colsReqdxyz = ['respondent_id', 'xyz_concern', 'xyz_knowledge',
        'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands',
        'behavioral_large_gatherings', 'behavioral_outside_home',
        'behavioral_touch_face', 'doctor_recc_xyz',
        'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
        'opinion_xyz_sick_from_vacc', 'age_group',
        'education', 'race', 'sex', 'income_poverty', 'marital_status',
        'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
        'household_adults', 'household_children']
    colsReqdseas = ['respondent_id', 'xyz_concern', 'xyz_knowledge',
        'behavioral_antiviral_meds', 'behavioral_avoidance',
        'behavioral_face_mask', 'behavioral_wash_hands',
        'behavioral_large_gatherings', 'behavioral_outside_home',
        'behavioral_touch_face', 'doctor_recc_seasonal',
        'chronic_med_condition', 'child_under_6_months', 'health_worker',
        'opinion_seas_vacc_effective','opinion_seas_risk', 'opinion_seas_sick_from_vacc',
        'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status',
        'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
        'household_adults', 'household_children']
    
    return df[colsReqdseas], df[colsReqdxyz]

xtrainseas, xtrainxyz = separate(xtrain)

ytrainseas, ytrainxyz = ytrain["seasonal_vaccine"], ytrain["xyz_vaccine"]

# Handle missing values

In [44]:
from sklearn.impute import SimpleImputer
def fillMissing(df):
    imp = SimpleImputer(strategy = "most_frequent")
    cols = df.columns
    fill = imp.fit_transform(df)
    return pd.DataFrame(fill, columns = cols, index = df.index)

xtrainxyz = fillMissing(xtrainxyz)
xtrainseas = fillMissing(xtrainseas)

# Encoding

In [45]:
# do one hot encoding for all
from sklearn.preprocessing import OneHotEncoder

def encode(df):
    toEncode = ['age_group','education','race','sex','income_poverty','marital_status','rent_or_own','employment_status','hhs_geo_region','census_msa']

    # one hot encoder
    encoder = OneHotEncoder()
    for i in toEncode:
        result = encoder.fit_transform(df[[i]])
        df = df.join(pd.DataFrame(result.toarray(), columns = encoder.categories_))
    
    # drop the pre-encoded cols
    df.drop(columns = toEncode, inplace = True)

    # convert the tuple type column names to string
    df.columns = [x[0] if type(x) == tuple else x for x in df.columns]

    # if dtype is still object, make it int
    for col in df.select_dtypes(include = ['object']).columns:
        df[col] = df[col].astype(int)
    
    return df

xtrainxyz = encode(xtrainxyz)
xtrainseas = encode(xtrainseas)

# Fitting models

In [90]:
from sklearn.model_selection import train_test_split

x = xtrainxyz
y = ytrainxyz
trainX1, testX1, trainY1, testY1 = train_test_split(x, y, test_size = 0.2)
x = xtrainseas
y = ytrainseas
trainX2, testX2, trainY2, testY2 = train_test_split(x, y, test_size = 0.2)

In [91]:
from sklearn.naive_bayes import MultinomialNB

modelxyz = MultinomialNB()
modelxyz.fit(trainX1, trainY1)
modelxyz.score(testX1, testY1)

0.7162111568700861

In [92]:
modelseas = MultinomialNB()
modelseas.fit(trainX2, trainY2)
modelseas.score(testX2, testY2)

0.6666042680643953

# Getting results

In [102]:
# filter useful cols
testSeasonal, testXYZ = separate(testSet)

# handle missing values
testSeasonal = fillMissing(testSeasonal)
testXYZ = fillMissing(testXYZ)

# encode text cols
testSeasonal = encode(testSeasonal)
testXYZ = encode(testXYZ)

In [110]:
h1n1 = modelxyz.predict(testXYZ)
seasonal = modelseas.predict(testSeasonal)

result = pd.DataFrame( {
    "h1n1_vaccine" : h1n1,
    "seasonal_vaccine" : seasonal
} ,index = testSeasonal["respondent_id"])

result

Unnamed: 0_level_0,h1n1_vaccine,seasonal_vaccine
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
26707,0,0
26708,0,0
26709,0,0
26710,0,0
26711,0,0
...,...,...
53410,0,0
53411,0,0
53412,0,0
53413,0,0


In [112]:
result.to_csv("result.csv")