In [1]:
import pandas as pd
import numpy as np
import seaborn as sns #visualisation
import matplotlib.pyplot as plt #visualisation
from sklearn.preprocessing import OneHotEncoder, scale, StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, average_precision_score,classification_report
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D, Bidirectional, LSTM, Reshape,TimeDistributed, GRU, concatenate, add,Input
import pickle
import tensorflow as tf
from sklearn.model_selection import cross_val_score



In [2]:
# Load the feature and label tables, then join them on respondent_id.
# The inner join keeps only respondents that appear in both files.
df_training = pd.read_csv("training_set_features.csv")
df_labels = pd.read_csv("training_set_labels.csv")
df = pd.merge(df_labels, df_training, on="respondent_id", how="inner")

In [3]:
# Clean the merged feature + label frame:
#   1. remove three columns outright,
#   2. discard every row that still contains a missing value.
dropped_columns = ["health_insurance", "employment_industry", "employment_occupation"]
df1 = df.drop(columns=dropped_columns)
df2 = df1.dropna()

In [4]:
# Build the label array from the int64 columns of the cleaned frame.
# Each row of df_y is [respondent_id, h1n1_vaccine]; the id is carried along
# so rows can still be traced after the train/test split.
label_columns = ["respondent_id", "h1n1_vaccine"]
int_columns = df2.select_dtypes(include=["int64"])
df_y = np.array(int_columns[label_columns])

In [5]:
# Keep an untransformed copy of the features (both vaccine label columns removed)
label_names = ["h1n1_vaccine", "seasonal_vaccine"]
df_original_features = df2.drop(columns=label_names)

In [7]:
# Build the unscaled feature matrix from the cleaned frame.
#
# Columns are routed purely by dtype. NOTE(review): this presumes respondent_id
# is the only int64 column left once the two label columns are dropped, and
# that every object column is a string category -- confirm against the CSVs.

# drop label columns
df_x = df2.drop(["h1n1_vaccine", "seasonal_vaccine"], axis=1)

# split the columns by dtype: int64, object (string categories), float64
df_int = df_x.select_dtypes(include=["int64"])
df_categories = df_x.select_dtypes(include=['object'])
df_float = df_x.select_dtypes(include=['float64'])

# turn the above df's into arrays
int_array = np.array(df_int)
features_array = np.array(df_categories)
float_array = np.array(df_float)

# transform categorical strings into a dense one-hot encoded array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current spelling first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(features_array)

# place the float columns and the one-hot columns side by side
# (int_array is kept apart here; scaling is done in a later cell)
df_features = np.concatenate((float_array, onehot), axis=1)

In [8]:
# Preview the unscaled matrix: float64 columns first, then the one-hot columns
df_features

array([[1., 0., 0., ..., 0., 0., 1.],
       [3., 2., 0., ..., 1., 0., 0.],
       [1., 1., 0., ..., 0., 1., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 1.],
       [1., 2., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [9]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split further down, so test rows contribute to the min/max statistics.
min_max_scaler = MinMaxScaler()
df_features = min_max_scaler.fit(df_features).transform(df_features)

In [10]:
# Prepend the int64 column(s) held in int_array (the respondent id) to the
# scaled feature array. The result is a single float array, so ids are stored
# as floats. Re-running this cell prepends the column again.
id_then_features = [int_array, df_features]
df_features = np.concatenate(id_then_features, axis=1)

In [11]:
# Preview after the id column was prepended; column 0 is unscaled, the rest lie in [0, 1]
df_features

array([[0.00000000e+00, 3.33333333e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.00000000e+00, 3.33333333e-01, 5.00000000e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [2.67020000e+04, 6.66666667e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [2.67030000e+04, 3.33333333e-01, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.67060000e+04, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [12]:
# Split features and labels into train and test partitions.
# Both arrays still carry the respondent id; the fixed seed makes the
# shuffle repeatable. No test_size is passed, so the library default applies.
split_parts = train_test_split(df_features, df_y, random_state=42)
X_train_master, X_test_master, y_train_master, y_test_master = split_parts

In [13]:
# Spot-check one training label row; rows are [respondent_id, h1n1_vaccine]
y_train_master[10]

array([14761,     0], dtype=int64)

In [14]:
# Same row index in the feature matrix; element 0 is the respondent id column
X_train_master[10]

array([1.47610000e+04, 3.33333333e-01, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.50000000e-01,
       2.50000000e-01, 2.50000000e-01, 2.50000000e-01, 2.50000000e-01,
       2.50000000e-01, 0.00000000e+00, 3.33333333e-01, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
      

In [15]:
# Training feature matrix, with the respondent id column still in position 0
X_train_master

array([[8.34500000e+03, 6.66666667e-01, 1.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.51590000e+04, 3.33333333e-01, 5.00000000e-01, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.17780000e+04, 3.33333333e-01, 5.00000000e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       ...,
       [7.32700000e+03, 0.00000000e+00, 5.00000000e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.15400000e+03, 6.66666667e-01, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.15010000e+04, 0.00000000e+00, 5.00000000e-01, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [16]:
# Strip the respondent id (column 0) before fitting:
#   X_* keep every column after the id,
#   y_* keep only the label column, flattened to 1-D.
X_train, X_test = X_train_master[:, 1:], X_test_master[:, 1:]
y_train = y_train_master[:, 1:].ravel()
y_test = y_test_master[:, 1:].ravel()

In [17]:
# Train a 100-tree random forest (seeded for repeatability) on the training split
model_optimal = RandomForestClassifier(n_estimators=100, random_state=42)
model_optimal = model_optimal.fit(X_train, y_train)

# Score the held-out split and report plain accuracy
y_model = model_optimal.predict(X_test)
test_accuracy = accuracy_score(y_test, y_model)
print(test_accuracy)

# Persisting the fitted model is currently disabled:
# filename = '../models/randomForestML.sav'
# joblib.dump(model_optimal, filename)

print("... Done ...")

0.8226430462227652
... Done ...
