In [13]:
import pandas as pd
import numpy as np
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
training_features = pd.read_csv('/content/drive/MyDrive/FluShot/training_set_features.csv')
training_labels = pd.read_csv('/content/drive/MyDrive/FluShot/training_set_labels.csv')

In [4]:
print(training_features.columns.tolist())
print(training_labels.columns.tolist())

['respondent_id', 'h1n1_concern', 'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance', 'behavioral_face_mask', 'behavioral_wash_hands', 'behavioral_large_gatherings', 'behavioral_outside_home', 'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal', 'chronic_med_condition', 'child_under_6_months', 'health_worker', 'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk', 'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective', 'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa', 'household_adults', 'household_children', 'employment_industry', 'employment_occupation']
['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']


In [5]:
training_features.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,health_worker,health_insurance,opinion_h1n1_vacc_effective,opinion_h1n1_risk,opinion_h1n1_sick_from_vacc,opinion_seas_vacc_effective,opinion_seas_risk,opinion_seas_sick_from_vacc,household_adults,household_children
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,...,25903.0,14433.0,26316.0,26319.0,26312.0,26245.0,26193.0,26170.0,26458.0,26458.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,...,0.111918,0.87972,3.850623,2.342566,2.35767,4.025986,2.719162,2.118112,0.886499,0.534583
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,...,0.315271,0.3253,1.007436,1.285539,1.362766,1.086565,1.385055,1.33295,0.753422,0.928173
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,3.0,1.0,1.0,4.0,2.0,1.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,4.0,2.0,2.0,4.0,2.0,2.0,1.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,1.0,5.0,4.0,4.0,5.0,4.0,4.0,1.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,5.0,5.0,5.0,5.0,5.0,5.0,3.0,3.0


In [6]:
training_features = training_features.drop("employment_industry", axis=1)
training_features = training_features.drop("employment_occupation", axis=1)
training_features = training_features.drop("hhs_geo_region", axis=1)

print(training_features.columns)


Index(['respondent_id', 'h1n1_concern', 'h1n1_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'census_msa', 'household_adults',
       'household_children'],
      dtype='object')


In [7]:
merged_df = pd.merge(training_features, training_labels, on='respondent_id')


categorical_features = ['age_group', 'education', 'race', 'sex', 'income_poverty', 'marital_status', 'rent_or_own', 'employment_status', 'census_msa']
for feature in categorical_features:
    encoder = LabelEncoder()
    merged_df[feature] = encoder.fit_transform(merged_df[feature])


In [8]:
merged_df.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
count,26707.0,26615.0,26591.0,26636.0,26499.0,26688.0,26665.0,26620.0,26625.0,26579.0,...,26707.0,26707.0,26707.0,26707.0,26707.0,26707.0,26458.0,26458.0,26707.0,26707.0
mean,13353.0,1.618486,1.262532,0.048844,0.725612,0.068982,0.825614,0.35864,0.337315,0.677264,...,0.406223,0.953795,0.545175,0.37492,0.656232,0.833489,0.886499,0.534583,0.212454,0.465608
std,7709.791156,0.910311,0.618149,0.215545,0.446214,0.253429,0.379448,0.47961,0.472802,0.467531,...,0.491136,1.113444,0.594485,0.622325,0.814312,0.823313,0.753422,0.928173,0.409052,0.498825
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6676.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,13353.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,20029.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0,1.0


In [None]:
merged_df = merged_df.dropna()

In [None]:
merged_df.describe()

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,sex,income_poverty,marital_status,rent_or_own,employment_status,census_msa,household_adults,household_children,h1n1_vaccine,seasonal_vaccine
count,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,...,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0,13506.0
mean,13381.26181,1.52895,1.255738,0.053532,0.725159,0.068488,0.816896,0.336221,0.318747,0.67955,...,0.402118,0.825855,0.476973,0.290093,0.538724,0.837258,0.906264,0.503998,0.29846,0.503628
std,7719.366832,0.894559,0.611544,0.2251,0.446451,0.252591,0.386766,0.472433,0.466008,0.466667,...,0.490344,1.019122,0.509033,0.509184,0.639224,0.826097,0.776522,0.904973,0.457599,0.500005
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6702.5,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,13355.5,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
75%,20084.5,2.0,2.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0
max,26706.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,3.0,2.0,2.0,3.0,2.0,3.0,3.0,1.0,1.0


In [9]:
X = merged_df.drop(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine'], axis=1)
y = merged_df[['h1n1_vaccine', 'seasonal_vaccine']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Random Forest


In [10]:
base_classifier = RandomForestClassifier()
classifier = MultiOutputClassifier(base_classifier)

In [11]:
classifier.fit(X_train, y_train)

ValueError: ignored

In [None]:
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.6813471502590673
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.58      0.66       779
           1       0.78      0.79      0.79      1322

   micro avg       0.78      0.71      0.74      2101
   macro avg       0.77      0.69      0.72      2101
weighted avg       0.78      0.71      0.74      2101
 samples avg       0.39      0.38      0.38      2101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Logistic Regression


In [None]:
model = MultiOutputClassifier(LogisticRegression())
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.6935603256846781
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.60      0.67       779
           1       0.79      0.79      0.79      1322

   micro avg       0.78      0.72      0.75      2101
   macro avg       0.78      0.70      0.73      2101
weighted avg       0.78      0.72      0.75      2101
 samples avg       0.40      0.38      0.38      2101



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Gradient Boosting


In [14]:
model = MultiOutputClassifier(HistGradientBoostingClassifier())
model.fit(X_train, y_train)

y_pred = model.predict(X)
accuracy = accuracy_score(y, y_pred)
report = classification_report(y, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.7184258808552065
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.53      0.63      5674
           1       0.80      0.78      0.79     12435

   micro avg       0.79      0.70      0.75     18109
   macro avg       0.79      0.66      0.71     18109
weighted avg       0.79      0.70      0.74     18109
 samples avg       0.37      0.35      0.35     18109



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
respondent_id = merged_df.respondent_id.tolist()
resp = []
for i in range(len(respondent_id)):
  resp.append([respondent_id[i],y_pred[i][0],y_pred[i][1]])

In [22]:
print(np.array(resp).shape)


(26707, 3)


In [20]:
import csv
fields = ['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']
with open('/content/drive/MyDrive/FluShot/results.csv', 'w') as f:
  write = csv.writer(f)
  write.writerow(fields)
  write.writerows(resp)