In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# import data

df_training = pd.read_csv("../Resources/training_set_features.csv")

df_labels = pd.read_csv("../Resources/training_set_labels.csv")
#merging

df = df_labels.merge(df_training, on = "respondent_id", how = "inner")

In [3]:
df.head()

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,...,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,employment_industry,employment_occupation
0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0,,
1,1,0,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,...,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0,pxcmvdjn,xgwztkwe
2,2,0,0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,...,"<= $75,000, Above Poverty",Not Married,Own,Employed,qufhixun,"MSA, Not Principle City",2.0,0.0,rucpziij,xtkaffoo
3,3,0,1,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0,,
4,4,0,0,2.0,1.0,0.0,1.0,0.0,1.0,1.0,...,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0,wxleyezf,emcorrxb


In [4]:
#checking for nulls
print(df.isnull().sum())

respondent_id                      0
h1n1_vaccine                       0
seasonal_vaccine                   0
h1n1_concern                      92
h1n1_knowledge                   116
behavioral_antiviral_meds         71
behavioral_avoidance             208
behavioral_face_mask              19
behavioral_wash_hands             42
behavioral_large_gatherings       87
behavioral_outside_home           82
behavioral_touch_face            128
doctor_recc_h1n1                2160
doctor_recc_seasonal            2160
chronic_med_condition            971
child_under_6_months             820
health_worker                    804
health_insurance               12274
opinion_h1n1_vacc_effective      391
opinion_h1n1_risk                388
opinion_h1n1_sick_from_vacc      395
opinion_seas_vacc_effective      462
opinion_seas_risk                514
opinion_seas_sick_from_vacc      537
age_group                          0
education                       1407
race                               0
s

In [5]:
# Clean feature and label dataset
df1 = df.drop(["health_insurance","employment_industry", "employment_occupation"], axis=1)

In [6]:
#dropping nulls
df2 = df1.dropna()

In [7]:
df2.count()

respondent_id                  19642
h1n1_vaccine                   19642
seasonal_vaccine               19642
h1n1_concern                   19642
h1n1_knowledge                 19642
behavioral_antiviral_meds      19642
behavioral_avoidance           19642
behavioral_face_mask           19642
behavioral_wash_hands          19642
behavioral_large_gatherings    19642
behavioral_outside_home        19642
behavioral_touch_face          19642
doctor_recc_h1n1               19642
doctor_recc_seasonal           19642
chronic_med_condition          19642
child_under_6_months           19642
health_worker                  19642
opinion_h1n1_vacc_effective    19642
opinion_h1n1_risk              19642
opinion_h1n1_sick_from_vacc    19642
opinion_seas_vacc_effective    19642
opinion_seas_risk              19642
opinion_seas_sick_from_vacc    19642
age_group                      19642
education                      19642
race                           19642
sex                            19642
i

In [8]:
df2.shape

(19642, 35)

In [9]:
df2.dtypes

respondent_id                    int64
h1n1_vaccine                     int64
seasonal_vaccine                 int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                       object
race                     

In [10]:
df2.to_csv("cleaned_data.csv")

In [11]:
# extract label dataset
df_y = df2.select_dtypes(include=['int64'])
df_y = df_y[["respondent_id","h1n1_vaccine"]]
df_y = np.array(df_y)

In [12]:
# Clean features dataset

# drop label columns
df_x = df2.drop(["h1n1_vaccine", "seasonal_vaccine"], axis=1)


In [13]:
df_x

Unnamed: 0,respondent_id,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,behavioral_outside_home,behavioral_touch_face,...,race,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,White,Female,Below Poverty,Not Married,Own,Not in Labor Force,oxchjgsf,Non-MSA,0.0,0.0
1,1,3.0,2.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,White,Male,Below Poverty,Not Married,Rent,Employed,bhuqouqj,"MSA, Not Principle City",0.0,0.0
3,3,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,White,Female,Below Poverty,Not Married,Rent,Not in Labor Force,lrircsnp,"MSA, Principle City",0.0,0.0
4,4,2.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,White,Female,"<= $75,000, Above Poverty",Married,Own,Employed,qufhixun,"MSA, Not Principle City",1.0,0.0
5,5,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,White,Male,"<= $75,000, Above Poverty",Married,Own,Employed,atmpeygn,"MSA, Principle City",2.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26700,26700,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,White,Female,"> $75,000",Married,Own,Not in Labor Force,lzgpxyit,"MSA, Principle City",1.0,0.0
26701,26701,2.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,White,Female,"> $75,000",Not Married,Rent,Not in Labor Force,fpwskwrf,"MSA, Principle City",3.0,0.0
26702,26702,2.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,White,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,qufhixun,Non-MSA,0.0,0.0
26703,26703,1.0,2.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,White,Male,"<= $75,000, Above Poverty",Not Married,Rent,Employed,lzgpxyit,"MSA, Principle City",1.0,0.0


In [14]:
# pull out respondent id, categorical string, and categorical number features
df_int = df_x.select_dtypes(include=["int64"])
df_categories = df_x.select_dtypes(include=['object'])
df_float = df_x.select_dtypes(include=['float64'])

In [15]:
# turn the above df's into arrays
int_array = np.array(df_int)
features_array = np.array(df_categories)
float_array = np.array(df_float)


In [18]:
# transform categorical strings into one hot encoded array
encoder = OneHotEncoder(sparse=False)
onehot = encoder.fit_transform(features_array)

# scale the features
df_features = np.concatenate((float_array, onehot),axis=1)

In [20]:
df_features

array([[1., 0., 0., ..., 0., 0., 1.],
       [3., 2., 0., ..., 1., 0., 0.],
       [1., 1., 0., ..., 0., 1., 0.],
       ...,
       [2., 0., 0., ..., 0., 0., 1.],
       [1., 2., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [21]:
# scale the features
min_max_scaler = MinMaxScaler()
df_features = min_max_scaler.fit_transform(df_features)

In [23]:
# concatenate the respondent id back to the scale feature array
df_features = np.concatenate((int_array,df_features),axis=1)
df_features

array([[0.00000000e+00, 0.00000000e+00, 3.33333333e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.00000000e+00, 3.00000000e+00, 3.33333333e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       ...,
       [2.67020000e+04, 2.67020000e+04, 6.66666667e-01, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00],
       [2.67030000e+04, 2.67030000e+04, 3.33333333e-01, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [2.67060000e+04, 2.67060000e+04, 0.00000000e+00, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00]])

In [26]:
# Separate features and labels into training and test sets
X_train_master, X_test_master, y_train_master, y_test_master = train_test_split(df_features, df_y, random_state=42)

In [27]:
# Remove training labels for model fitting purposes
X_train = X_train_master[:,1:]
X_test = X_test_master[:,1:]
y_train = np.ravel(y_train_master[:,1:])
y_test = np.ravel(y_test_master[:,1:])

In [28]:
#Define Model Instances
from sklearn.ensemble import RandomForestClassifier
model_optimal = RandomForestClassifier(n_estimators = 100, random_state=42)

In [29]:
#Fit Model
model_optimal.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [30]:
# Predict
y_model = model_optimal.predict(X_test)

In [31]:
features_training_score = round((model_optimal.score(X_train, y_train))*100,3) 
features_accuracy = round((model_optimal.score(X_test, y_test))*100,3)
print(f"Training Data Score: {features_training_score}%")
print(f"Testing Data Score: {features_accuracy}%")

Training Data Score: 99.993%
Testing Data Score: 82.305%


In [32]:
#Predictions for model
predictions =model_optimal.predict(X_test)
classifications = y_test.tolist()

predicted = {
    'Actual': y_test,
    'Prediction': predictions
}

prediction_df = pd.DataFrame(predicted)
prediction_df = prediction_df.set_index('Actual').reset_index()
prediction_df.head(15)

Unnamed: 0,Actual,Prediction
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,1,1
9,0,0


In [33]:
#Save Model
import joblib
filename = 'randomForestML.sav'
joblib.dump(model_optimal, filename)

['randomForestML.sav']

In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
print(accuracy_score(y_test,y_model))
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

0.8230502952555487
              precision    recall  f1-score   support

           0       0.84      0.95      0.89      3803
           1       0.68      0.40      0.51      1108

    accuracy                           0.82      4911
   macro avg       0.76      0.67      0.70      4911
weighted avg       0.81      0.82      0.81      4911

[[3595  208]
 [ 661  447]]


In [35]:
prediction_df.shape

(4911, 2)

In [41]:
y_test_master.shape

(4911, 2)

In [47]:
type(y_test_master)

numpy.ndarray

In [115]:
ids = y_test_master[:,0]

In [69]:
id_df = pd.DataFrame(ids)

In [159]:
results = id_df.join(prediction_df)
results

Unnamed: 0,0,Actual,Prediction
0,9465,0,0
1,1545,0,0
2,4979,0,0
3,20661,0,0
4,22541,0,0
...,...,...,...
4906,5521,0,0
4907,2024,0,0
4908,19745,0,0
4909,5335,0,0


In [157]:
 results.columns

Index([0, 'Actual', 'Prediction'], dtype='object')

In [163]:
results = results.rename({0:'respondent_id'}, axis=1)

In [165]:
results.set_index('respondent_id',inplace=True)

In [166]:
results


Unnamed: 0_level_0,Actual,Prediction
respondent_id,Unnamed: 1_level_1,Unnamed: 2_level_1
9465,0,0
1545,0,0
4979,0,0
20661,0,0
22541,0,0
...,...,...
5521,0,0
2024,0,0
19745,0,0
5335,0,0


In [167]:
new_results_df = df2.merge(results,on='respondent_id',how = 'inner')

In [169]:
new_results_df.shape

(4911, 37)

In [170]:
new_results_df.columns

Index(['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine', 'h1n1_concern',
       'h1n1_knowledge', 'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_h1n1', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_h1n1_vacc_effective', 'opinion_h1n1_risk',
       'opinion_h1n1_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'Actual', 'Prediction'],
      dtype='object')

In [181]:
new_results_df['h1n1_vaccine'].equals(new_results_df['Actual'])

True

In [182]:
new_prediction_df = new_results_df.drop('Actual',axis = 1)

In [183]:
new_prediction_df.shape

(4911, 36)

In [185]:
random_subset = new_prediction_df.sample(n=30)
random_subset

Unnamed: 0,respondent_id,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,behavioral_large_gatherings,...,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,Prediction
3250,17680,0,0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,...,Female,"> $75,000",Married,Own,Employed,bhuqouqj,"MSA, Not Principle City",1.0,0.0,0
3154,17173,0,0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,...,Male,"<= $75,000, Above Poverty",Married,Own,Employed,fpwskwrf,"MSA, Not Principle City",2.0,1.0,0
3499,18953,0,1,2.0,2.0,0.0,1.0,0.0,1.0,1.0,...,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,fpwskwrf,"MSA, Not Principle City",0.0,0.0,0
2529,13708,0,1,2.0,2.0,0.0,1.0,0.0,1.0,0.0,...,Female,"> $75,000",Married,Own,Not in Labor Force,oxchjgsf,"MSA, Not Principle City",1.0,0.0,1
1870,10393,0,0,1.0,2.0,0.0,1.0,0.0,1.0,0.0,...,Male,"<= $75,000, Above Poverty",Married,Own,Employed,lrircsnp,"MSA, Not Principle City",1.0,2.0,0
2391,12948,1,1,2.0,1.0,1.0,1.0,0.0,1.0,0.0,...,Male,"<= $75,000, Above Poverty",Not Married,Rent,Not in Labor Force,qufhixun,"MSA, Principle City",1.0,0.0,1
1740,9706,0,0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,Female,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,dqpwygqj,"MSA, Not Principle City",1.0,2.0,0
2059,11337,1,0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,...,Male,"> $75,000",Married,Own,Employed,lzgpxyit,Non-MSA,1.0,1.0,0
1675,9314,0,1,2.0,1.0,0.0,0.0,0.0,1.0,0.0,...,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,lzgpxyit,"MSA, Not Principle City",0.0,0.0,0
4594,24897,0,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,Female,"<= $75,000, Above Poverty",Not Married,Rent,Employed,kbazzjca,"MSA, Principle City",2.0,0.0,0


In [187]:
# Using DataFrame.insert() to add a column 
random_subset.insert(1, "id", range(30), True)

In [188]:
random_subset.shape

(30, 37)

In [189]:
random_subset.head(30)

Unnamed: 0,respondent_id,id,h1n1_vaccine,seasonal_vaccine,h1n1_concern,h1n1_knowledge,behavioral_antiviral_meds,behavioral_avoidance,behavioral_face_mask,behavioral_wash_hands,...,sex,income_poverty,marital_status,rent_or_own,employment_status,hhs_geo_region,census_msa,household_adults,household_children,Prediction
3250,17680,0,0,0,1.0,2.0,0.0,1.0,0.0,1.0,...,Female,"> $75,000",Married,Own,Employed,bhuqouqj,"MSA, Not Principle City",1.0,0.0,0
3154,17173,1,0,0,2.0,1.0,0.0,1.0,0.0,1.0,...,Male,"<= $75,000, Above Poverty",Married,Own,Employed,fpwskwrf,"MSA, Not Principle City",2.0,1.0,0
3499,18953,2,0,1,2.0,2.0,0.0,1.0,0.0,1.0,...,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,fpwskwrf,"MSA, Not Principle City",0.0,0.0,0
2529,13708,3,0,1,2.0,2.0,0.0,1.0,0.0,1.0,...,Female,"> $75,000",Married,Own,Not in Labor Force,oxchjgsf,"MSA, Not Principle City",1.0,0.0,1
1870,10393,4,0,0,1.0,2.0,0.0,1.0,0.0,1.0,...,Male,"<= $75,000, Above Poverty",Married,Own,Employed,lrircsnp,"MSA, Not Principle City",1.0,2.0,0
2391,12948,5,1,1,2.0,1.0,1.0,1.0,0.0,1.0,...,Male,"<= $75,000, Above Poverty",Not Married,Rent,Not in Labor Force,qufhixun,"MSA, Principle City",1.0,0.0,1
1740,9706,6,0,0,1.0,1.0,0.0,1.0,0.0,1.0,...,Female,"<= $75,000, Above Poverty",Married,Own,Not in Labor Force,dqpwygqj,"MSA, Not Principle City",1.0,2.0,0
2059,11337,7,1,0,2.0,1.0,0.0,1.0,0.0,1.0,...,Male,"> $75,000",Married,Own,Employed,lzgpxyit,Non-MSA,1.0,1.0,0
1675,9314,8,0,1,2.0,1.0,0.0,0.0,0.0,1.0,...,Female,"<= $75,000, Above Poverty",Not Married,Own,Not in Labor Force,lzgpxyit,"MSA, Not Principle City",0.0,0.0,0
4594,24897,9,0,0,0.0,1.0,0.0,0.0,0.0,0.0,...,Female,"<= $75,000, Above Poverty",Not Married,Rent,Employed,kbazzjca,"MSA, Principle City",2.0,0.0,0


In [194]:
random_subset.to_csv("sample_prediction_set.csv",index=False)

In [195]:
random_subset.dtypes

respondent_id                    int64
id                               int32
h1n1_vaccine                     int64
seasonal_vaccine                 int64
h1n1_concern                   float64
h1n1_knowledge                 float64
behavioral_antiviral_meds      float64
behavioral_avoidance           float64
behavioral_face_mask           float64
behavioral_wash_hands          float64
behavioral_large_gatherings    float64
behavioral_outside_home        float64
behavioral_touch_face          float64
doctor_recc_h1n1               float64
doctor_recc_seasonal           float64
chronic_med_condition          float64
child_under_6_months           float64
health_worker                  float64
opinion_h1n1_vacc_effective    float64
opinion_h1n1_risk              float64
opinion_h1n1_sick_from_vacc    float64
opinion_seas_vacc_effective    float64
opinion_seas_risk              float64
opinion_seas_sick_from_vacc    float64
age_group                       object
education                

In [196]:
random_subset.shape

(30, 37)