In [3]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder

#### Loading Datasets

In [4]:
Ftrs_Train=pd.read_csv("training_set_features.csv",index_col=[0])
Labels_Train=pd.read_csv("training_set_labels.csv",index_col=[0])
Ftrs_Test=pd.read_csv("test_set_features.csv",index_col=[0])

#### Preprocessing Data

In [5]:
#Dropping the columns as they have over 12,000 NaN values
Ftrs_Train.drop(columns=['employment_occupation','employment_industry'],inplace=True)
Ftrs_Test.drop(columns=['employment_occupation','employment_industry'],inplace=True)

#Filling NaN values with most frequent class
Ftrs_Train=Ftrs_Train.apply(lambda x: x.fillna(x.value_counts().index[0]))
Ftrs_Test=Ftrs_Test.apply(lambda x: x.fillna(x.value_counts().index[0]))


object_column=['age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region','census_msa']

#Encoding Categorial columns
for column in object_column:
    encoder=OrdinalEncoder()
    Ftrs_Train[column]=encoder.fit_transform(Ftrs_Train[[column]])
    Ftrs_Test[column]=encoder.fit_transform(Ftrs_Test[[column]])

#### Using **binary relevance** for multi-label Classification
Considering Target labels to be independent of each other

In [7]:
#Predicting Probability for h1n1 vaccines
logreg=LogisticRegression(max_iter=1000,solver='lbfgs')
logreg.fit(Ftrs_Train,Labels_Train.iloc[:,0])

#Choosing probability of the positive class
prob_h1n1=logreg.predict_proba(Ftrs_Test)[:,1]


#Predicting Probability for Seasonal vaccines
logreg.fit(Ftrs_Train,Labels_Train.iloc[:,1])

#Choosing probability of the positive class
prob_seasonal=logreg.predict_proba(Ftrs_Test)[:,1]

In [11]:
result=pd.DataFrame(prob_h1n1,columns=["h1n1_vaccine"])

result["seasonal vaccine"]=pd.Series(prob_seasonal)

result.to_csv("results.csv",index_label="respondent_id")
result.head(10)

Unnamed: 0,h1n1_vaccine,seasonal vaccine
0,0.073829,0.211342
1,0.041694,0.034216
2,0.445388,0.693182
3,0.484228,0.879726
4,0.224462,0.49301
5,0.409471,0.911629
6,0.349004,0.504131
7,0.15704,0.3156
8,0.037369,0.175334
9,0.191272,0.856585
