In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import copy
import seaborn as sns

submission_format = pd.read_csv('submission_format.csv')
test_set_features = pd.read_csv('test_set_features.csv')
training_set_features = pd.read_csv('training_set_features.csv')
training_set_labels = pd.read_csv('training_set_labels.csv')

In [2]:
pd.options.display.max_columns = None

In [3]:
training_set_features.select_dtypes(include='object')

Unnamed: 0,age_group,education,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,employment_industry,employment_occupation
0,55 - 64 Years,< 12 Years,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,,
1,35 - 44 Years,12 Years,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",pxcmvdjn,xgwztkwe
2,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",rucpziij,xtkaffoo
3,65+ Years,12 Years,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",,
4,45 - 54 Years,Some College,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",wxleyezf,emcorrxb
...,...,...,...,...,...,...,...,...,...,...,...,...
26702,65+ Years,Some College,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,,
26703,18 - 34 Years,College Graduate,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",fcxhlnwr,cmhcxjea
26704,55 - 64 Years,Some College,White,Female,,Not Married,Own,,lzgpxyit,"MSA, Not Principle City",,
26705,18 - 34 Years,Some College,Hispanic,Female,"<= $75,000, Above Poverty",Married,Rent,Employed,lrircsnp,Non-MSA,fcxhlnwr,haliazsg


In [4]:
ordinal_cols = ['age_group', 'education', 'income_poverty', 'rent_or_own', 'employment_status', 'census_msa']
nominal_cols = ['race', 'sex', 'marital_status', 'hhs_geo_region', 'employment_industry', 'employment_occupation']
numerical_cols = training_set_features.select_dtypes(exclude='object').columns

In [77]:
from sklearn.impute import KNNImputer
knn = KNNImputer()
num_training_set = knn.fit_transform(training_set_features[numerical_cols])
num_training_set_mod =  pd.DataFrame(num_training_set, columns=numerical_cols).apply(round).astype(int)

In [17]:
num_training_set_mod.shape

(26707, 24)

In [16]:
num_training_set_mod.isna().sum()

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
household_adults               0
household_children             0
dtype: int64

In [5]:
from sklearn.impute import SimpleImputer
mode = SimpleImputer(strategy='most_frequent')
str_training_set = mode.fit_transform(training_set_features[ordinal_cols+nominal_cols])
str_training_set_mod = pd.DataFrame(str_training_set, columns=ordinal_cols+nominal_cols)

In [18]:
mod_training_set = pd.concat([num_training_set_mod,str_training_set_mod], axis=1)

In [20]:
mod_training_set.columns

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'household_adults',
       'household_children', 'age_group', 'education', 'income_poverty',
       'rent_or_own', 'employment_status', 'census_msa', 'race', 'sex',
       'marital_status', 'hhs_geo_region', 'employment_industry',
       'employment_occupation'],
      dtype='object')

In [22]:
training_set = mod_training_set.drop(columns='respondent_id')

In [25]:
mod_training_set_labels = training_set_labels.drop(columns='respondent_id')

In [157]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(training_set, mod_training_set_labels, test_size=0.2)

In [158]:
X_train[ordinal_cols]

Unnamed: 0,age_group,education,income_poverty,rent_or_own,employment_status,census_msa
9377,65+ Years,12 Years,"<= $75,000, Above Poverty",Own,Not in Labor Force,"MSA, Principle City"
9098,18 - 34 Years,Some College,"> $75,000",Own,Not in Labor Force,"MSA, Not Principle City"
17392,35 - 44 Years,12 Years,"<= $75,000, Above Poverty",Own,Employed,Non-MSA
24066,18 - 34 Years,College Graduate,"> $75,000",Rent,Not in Labor Force,"MSA, Not Principle City"
14047,35 - 44 Years,College Graduate,"> $75,000",Own,Not in Labor Force,"MSA, Principle City"
...,...,...,...,...,...,...
25601,45 - 54 Years,College Graduate,"<= $75,000, Above Poverty",Own,Employed,Non-MSA
11796,65+ Years,Some College,"<= $75,000, Above Poverty",Own,Not in Labor Force,"MSA, Principle City"
9345,45 - 54 Years,Some College,"<= $75,000, Above Poverty",Own,Employed,"MSA, Not Principle City"
25857,45 - 54 Years,College Graduate,"> $75,000",Own,Employed,"MSA, Principle City"


In [159]:
training_set_features[ordinal_cols[5]].unique()

array(['Non-MSA', 'MSA, Not Principle  City', 'MSA, Principle City'],
      dtype=object)

In [160]:
category_order = [['18 - 34 Years', '35 - 44 Years', '45 - 54 Years', '55 - 64 Years', '65+ Years'], ['< 12 Years', '12 Years', 'Some College', 'College Graduate'],
                  ['Below Poverty', '<= $75,000, Above Poverty', '> $75,000'], ['Rent', 'Own'], ['Not in Labor Force', 'Unemployed', 'Employed'],
                  ['Non-MSA', 'MSA, Not Principle  City', 'MSA, Principle City']]

In [161]:
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder(categories= category_order)

oe.fit(X_train[ordinal_cols])

X_train_ordinal = oe.transform(X_train[ordinal_cols])

X_test_ordinal = oe.transform(X_test[ordinal_cols])

In [162]:
X_test_ordinal

array([[4., 2., 1., 1., 0., 1.],
       [2., 3., 2., 1., 2., 2.],
       [1., 3., 1., 1., 2., 0.],
       ...,
       [4., 1., 1., 1., 0., 1.],
       [1., 3., 1., 0., 2., 2.],
       [0., 3., 1., 1., 2., 1.]])

In [163]:
X_train_ordinal.shape

(21365, 6)

In [164]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

X_train_nominal = ohe.fit_transform(X_train[nominal_cols]).toarray()

X_test_nominal = ohe.transform(X_test[nominal_cols]).toarray()

In [165]:
num_cols = X_train.select_dtypes(exclude='object').columns

In [166]:
X_train_numerical = X_train[num_cols].to_numpy()
X_test_numerical = X_test[num_cols].to_numpy()

In [167]:
X_train_new = np.hstack((X_train_numerical, X_train_nominal, X_train_ordinal))
X_test_new = np.hstack((X_test_numerical, X_test_nominal, X_test_ordinal))

In [168]:
X_test_new.shape , X_train_new.shape

((5342, 91), (21365, 91))

In [64]:
y_train.columns

Index(['xyz_vaccine', 'seasonal_vaccine'], dtype='object')

Creating the 4 different classes

In [65]:
y_train_new = (y_train['xyz_vaccine'] * 2 ) + y_train['seasonal_vaccine']

In [66]:
y_test_new = (y_test['xyz_vaccine'] * 2 ) + y_test['seasonal_vaccine']

## SVM

In [67]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model = svm_model.fit(X_train_new, y_train_new)

In [68]:
y_pred = svm_model.predict(X_test_new)
print(classification_report(y_test_new, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.88      0.78      2705
           1       0.64      0.50      0.56      1546
           2       0.00      0.00      0.00       176
           3       0.60      0.51      0.55       915

    accuracy                           0.68      5342
   macro avg       0.49      0.47      0.47      5342
weighted avg       0.65      0.68      0.65      5342



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Logistic Regression

In [69]:
from sklearn.linear_model import LogisticRegression
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train_new, y_train_new)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [71]:
y_pred = lg_model.predict(X_test_new)
print(classification_report(y_test_new, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.86      0.79      2705
           1       0.63      0.53      0.58      1546
           2       0.53      0.09      0.16       176
           3       0.58      0.51      0.55       915

    accuracy                           0.68      5342
   macro avg       0.62      0.50      0.52      5342
weighted avg       0.67      0.68      0.67      5342



## Naive Bayes

In [72]:
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train_new, y_train_new)

In [73]:
y_pred = nb_model.predict(X_test_new)
print(classification_report(y_test_new, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.43      0.54      2705
           1       0.40      0.51      0.45      1546
           2       0.06      0.14      0.08       176
           3       0.38      0.54      0.45       915

    accuracy                           0.46      5342
   macro avg       0.39      0.41      0.38      5342
weighted avg       0.55      0.46      0.48      5342



## KNN

In [74]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_new, y_train_new)

In [75]:
y_pred = knn_model.predict(X_test_new)
print(classification_report(y_test_new, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.84      0.75      2705
           1       0.51      0.47      0.49      1546
           2       0.16      0.02      0.04       176
           3       0.53      0.31      0.39       915

    accuracy                           0.61      5342
   macro avg       0.47      0.41      0.42      5342
weighted avg       0.58      0.61      0.59      5342



In [184]:
# using classifier chains
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB

# initialize classifier chains multi-label classifier
# with a gaussian naive bayes base classifier
classifier = ClassifierChain(GaussianNB())

# train
classifier.fit(X_train_new, y_train)

# predict
predictions = classifier.predict(X_test_new)

accuracy_score(y_test,predictions)

0.44496443279670533

In [185]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.33      0.70      0.44      1136
           1       0.57      0.77      0.65      2414

   micro avg       0.46      0.75      0.57      3550
   macro avg       0.45      0.73      0.55      3550
weighted avg       0.49      0.75      0.59      3550
 samples avg       0.29      0.36      0.31      3550



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [171]:
# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# initialize Label Powerset multi-label classifier
# with a gaussian naive bayes base classifier
lp_nb_model = LabelPowerset(GaussianNB())

# train
lp_nb_model.fit(X_train_new, y_train)

# predict
predictions = lp_nb_model.predict(X_test_new)

accuracy_score(y_test,predictions)

0.44945713216023964

In [172]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.38      0.64      0.47      1136
           1       0.55      0.78      0.65      2414

   micro avg       0.49      0.73      0.59      3550
   macro avg       0.46      0.71      0.56      3550
weighted avg       0.49      0.73      0.59      3550
 samples avg       0.34      0.36      0.34      3550



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Multilabel LogReg

In [187]:
from sklearn.linear_model import LogisticRegression

# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
lp_lg_model = LabelPowerset(LogisticRegression())

lp_lg_model.fit(X_train_new, y_train)

predictions = lp_lg_model.predict(X_test_new)

print('Label Powerset - ',accuracy_score(y_test,predictions))
print(classification_report(y_test, predictions))


from skmultilearn.problem_transform import ClassifierChain
cfc_lg_model = ClassifierChain(LogisticRegression())

# train
cfc_lg_model.fit(X_train_new, y_train)

# predict
predictions = cfc_lg_model.predict(X_test_new)


print('Classifier Chain - ',accuracy_score(y_test,predictions))
print(classification_report(y_test, predictions))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Label Powerset -  0.6866342193934856
              precision    recall  f1-score   support

           0       0.66      0.49      0.57      1136
           1       0.79      0.71      0.74      2414

   micro avg       0.75      0.64      0.69      3550
   macro avg       0.72      0.60      0.65      3550
weighted avg       0.75      0.64      0.69      3550
 samples avg       0.31      0.31      0.30      3550



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Classifier Chain -  0.6793335829277424
              precision    recall  f1-score   support

           0       0.69      0.44      0.54      1136
           1       0.79      0.69      0.73      2414

   micro avg       0.76      0.61      0.68      3550
   macro avg       0.74      0.56      0.63      3550
weighted avg       0.75      0.61      0.67      3550
 samples avg       0.31      0.30      0.30      3550



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [194]:
from sklearn.metrics import score

ImportError: cannot import name 'score' from 'sklearn.metrics' (/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/metrics/__init__.py)

In [191]:
pd.DataFrame(predictions.toarray())

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,0.0
2,0.0,0.0
3,1.0,0.0
4,0.0,0.0
...,...,...
5337,0.0,0.0
5338,0.0,0.0
5339,0.0,1.0
5340,0.0,0.0


In [188]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


# using Label Powerset
from skmultilearn.problem_transform import LabelPowerset
lp_svm_model = LabelPowerset(SVC())

lp_svm_model.fit(X_train_new, y_train)

predictions = lp_svm_model.predict(X_test_new)

print('Label Powerset - ',accuracy_score(y_test,predictions))
print(classification_report(y_test, predictions))


from skmultilearn.problem_transform import ClassifierChain
cfc_svm_model = ClassifierChain(SVC())

# train
cfc_svm_model.fit(X_train_new, y_train)

# predict
predictions = cfc_svm_model.predict(X_test_new)


print('Classifier Chain - ',accuracy_score(y_test,predictions))
print(classification_report(y_test, predictions))


Label Powerset -  0.6873830026207413
              precision    recall  f1-score   support

           0       0.68      0.47      0.56      1136
           1       0.80      0.69      0.74      2414

   micro avg       0.76      0.62      0.68      3550
   macro avg       0.74      0.58      0.65      3550
weighted avg       0.76      0.62      0.68      3550
 samples avg       0.30      0.30      0.29      3550



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classifier Chain -  0.6810183451890678
              precision    recall  f1-score   support

           0       0.72      0.39      0.50      1136
           1       0.80      0.68      0.73      2414

   micro avg       0.78      0.59      0.67      3550
   macro avg       0.76      0.53      0.62      3550
weighted avg       0.77      0.59      0.66      3550
 samples avg       0.30      0.29      0.29      3550



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [174]:
training_set_features.shape

(26707, 36)

In [175]:
26707 ** 0.5

163.42276463210382

In [180]:
y_train_np = y_train.to_numpy()

In [182]:
from skmultilearn.adapt import MLkNN

ml_knn_model = MLkNN(k=91)

# train
ml_knn_model.fit(X_train_new, y_train_np)

# predict
predictions = ml_knn_model.predict(X_test_new)

accuracy_score(y_test,predictions)

TypeError: NearestNeighbors.__init__() takes 1 positional argument but 2 were given

In [177]:
X_train_new

array([[2., 2., 0., ..., 1., 0., 2.],
       [1., 0., 0., ..., 1., 0., 1.],
       [1., 1., 0., ..., 1., 2., 0.],
       ...,
       [1., 1., 0., ..., 1., 2., 1.],
       [0., 1., 0., ..., 1., 2., 2.],
       [1., 2., 0., ..., 0., 2., 2.]])

In [153]:
accuracy_score(y_test,pd.DataFrame(predictions.toarray()))

ValueError: Found input variables with inconsistent numbers of samples: [5342, 26708]

In [169]:
y_test.shape

(5342, 2)

In [170]:
pd.DataFrame(predictions.toarray()).shape

(26708, 2)

In [144]:
X_train_new.shape

(21365, 91)

In [156]:
X_test_new.shape

(26708, 91)

In [145]:
y_test

Unnamed: 0,xyz_vaccine,seasonal_vaccine
18429,0,1
14792,0,0
1003,0,0
18643,1,1
15641,1,1
...,...,...
14587,0,0
18837,0,0
23704,0,0
21248,0,1


In [None]:
test_set_features

In [195]:
num_test_set = knn.transform(test_set_features[numerical_cols])
num_test_set_mod =  pd.DataFrame(num_test_set, columns=numerical_cols).apply(round).astype(int)

In [196]:
str_test_set = mode.transform(test_set_features[ordinal_cols+nominal_cols])
str_test_set_mod = pd.DataFrame(str_test_set, columns=ordinal_cols+nominal_cols)

In [197]:
mod_test_set = pd.concat([num_test_set_mod,str_test_set_mod], axis=1)

In [198]:
test_set = mod_test_set.drop(columns='respondent_id')

In [199]:
X_test_ordinal = oe.transform(test_set[ordinal_cols])

In [200]:
X_test_nominal = ohe.fit_transform(test_set[nominal_cols]).toarray()

In [201]:
X_test_numerical = test_set[num_cols].to_numpy()

In [202]:
X_test_set = np.hstack((X_test_numerical, X_test_nominal, X_test_ordinal))

Out of all the Models the Label Powerset Logistic Regression and SVM were better but I will Choose LogReg as it had better f1 scores

In [203]:
predictions = lp_lg_model.predict_proba(X_test_set).toarray()

In [206]:
submission = pd.DataFrame(predictions)

In [207]:
submission

Unnamed: 0,0,1
0,0.066033,0.260642
1,0.041195,0.033318
2,0.424826,0.664071
3,0.513086,0.842111
4,0.168110,0.474550
...,...,...
26703,0.312641,0.469440
26704,0.072304,0.210816
26705,0.131269,0.176911
26706,0.067073,0.463521


In [210]:
y_train

Unnamed: 0,xyz_vaccine,seasonal_vaccine
9377,0,1
9098,0,0
17392,0,1
24066,0,0
14047,0,0
...,...,...
25601,0,1
11796,0,1
9345,0,0
25857,0,0


In [209]:
answer = pd.concat([mod_test_set['respondent_id'], submission] , axis=1)

In [211]:
answer

Unnamed: 0,respondent_id,0,1
0,26707,0.066033,0.260642
1,26708,0.041195,0.033318
2,26709,0.424826,0.664071
3,26710,0.513086,0.842111
4,26711,0.168110,0.474550
...,...,...,...
26703,53410,0.312641,0.469440
26704,53411,0.072304,0.210816
26705,53412,0.131269,0.176911
26706,53413,0.067073,0.463521


In [213]:
final_submission = answer.rename(columns={0: 'h1n1_vaccine', 1: 'seasonal_vaccine'})

In [217]:
final_submission_mod = final_submission.round(4)

In [218]:
final_submission_mod.to_csv('final_submission.csv', index=False)

In [87]:
y_pred = lg_model.predict(X_test_new)

as LogReg model was a bit better than SVM

In [88]:
y_pred = lg_model.predict_proba(X_test_new)

array([0, 0, 3, ..., 0, 0, 3])

In [137]:
y_pred_prob = lg_model.predict_proba(X_test_new)

In [138]:
y_pred_prob

array([[0.71165987, 0.21283504, 0.03975982, 0.03574527],
       [0.93083313, 0.02797836, 0.03500256, 0.00618595],
       [0.28834769, 0.235699  , 0.04470597, 0.43124734],
       ...,
       [0.7537179 , 0.11552728, 0.05467867, 0.07607615],
       [0.54113636, 0.38864406, 0.02119689, 0.0490227 ],
       [0.28144834, 0.15528968, 0.15905883, 0.40420315]])

In [89]:
answer = pd.Series(y_pred)

In [92]:
submission = pd.concat([mod_test_set['respondent_id'], answer] , axis=1)

In [None]:
y_train_new = (y_train['xyz_vaccine'] * 2 ) + y_train['seasonal_vaccine']

In [97]:
submission['h1n1_vaccine'] = (submission[0] / 2).astype(int)

In [109]:
submission[submission[0] == 0]

Unnamed: 0,respondent_id,0,h1n1_vaccine
0,26707,0,0
1,26708,0,0
4,26711,0,0
6,26713,0,0
7,26714,0,0
...,...,...,...
26702,53409,0,0
26703,53410,0,0
26704,53411,0,0
26705,53412,0,0


In [117]:
submission[submission[0] == (1 or 3)]

Unnamed: 0,respondent_id,0,h1n1_vaccine
5,26712,1,0
9,26716,1,0
12,26719,1,0
17,26724,1,0
20,26727,1,0
...,...,...,...
26690,53397,1,0
26691,53398,1,0
26694,53401,1,0
26697,53404,1,0


In [130]:
submission['seasonal_vaccine'] = ((submission[0] == 1) | (submission[0] == 3)).astype(int)

In [131]:
submission[submission[0] == 3]

Unnamed: 0,respondent_id,0,h1n1_vaccine,seasonal_vaccine
2,26709,3,1,1
3,26710,3,1,1
16,26723,3,1,1
26,26733,3,1,1
27,26734,3,1,1
...,...,...,...,...
26679,53386,3,1,1
26682,53389,3,1,1
26684,53391,3,1,1
26687,53394,3,1,1


In [133]:
submission.drop(columns=0, inplace=True)

In [134]:
submission

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine
0,26707,0,0
1,26708,0,0
2,26709,1,1
3,26710,1,1
4,26711,0,0
...,...,...,...
26703,53410,0,0
26704,53411,0,0
26705,53412,0,0
26706,53413,0,0


In [136]:
submission.to_csv('submission.csv', index=False)