In [5]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Seed NumPy's global random number generator
np.random.seed(42)

# Load the training features and the training labels from disk
data = pd.read_csv("data/training_set_features.csv")
labels = pd.read_csv("data/training_set_labels.csv")

# Attach both target columns to the feature frame.
# Assignment aligns on the row index, so this presumes both CSVs list
# respondents in the same order -- TODO confirm.
for target in ("xyz_vaccine", "seasonal_vaccine"):
    data[target] = labels[target]

# Impute missing health_insurance values with the column mean (NaNs are skipped
# when the mean is computed)
data["health_insurance"] = data["health_insurance"].fillna(data["health_insurance"].mean())

# Drop the two employment columns (noted by the author as having many NaN values);
# health_insurance is kept because it was imputed above
toDrop = ["employment_industry", "employment_occupation"]
dropped_data = data.drop(columns=toDrop)

# X holds the feature variables, y holds the two target variables
X = dropped_data.drop(columns=["xyz_vaccine", "seasonal_vaccine", "respondent_id"])
y = dropped_data[["xyz_vaccine", "seasonal_vaccine"]]


# One-hot encode the categorical columns; every other column is passed through unchanged.
categorical_features = ["age_group", "education", "race", "sex", "income_poverty", "marital_status", "rent_or_own", "employment_status", "hhs_geo_region", "census_msa"]

# handle_unknown="ignore": when this fitted transformer is later applied to new data
# (e.g. the test set), a category that was not seen during fit is encoded as all zeros
# instead of raising an error. The encoding of the training data itself is unaffected.
onehot = OneHotEncoder(handle_unknown="ignore")

# sparse_threshold=0 forces a dense array, so wrapping the result in pd.DataFrame
# always yields one numeric column per encoded feature rather than depending on how
# sparse the encoded matrix happens to be.
transformer = ColumnTransformer([("onehot", onehot, categorical_features)],
                                remainder="passthrough",
                                sparse_threshold=0)
transformed_X = pd.DataFrame(transformer.fit_transform(X))

# Splitting Data
# NOTE(review): this hold-out split is not consumed by the grid search below, which
# cross-validates on the full training set. It is kept (and seeded) so that any
# later evaluation on X_test / y_test is reproducible.
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2, random_state=42)

# Initialising RandomForestClassifier
# random_state is set explicitly: GridSearchCV runs the fits in parallel workers
# (n_jobs=3), which do not share the NumPy global seed set in the notebook process,
# so relying on np.random.seed alone does not make the forests reproducible.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Making grid of hyperparameters for GridSearchCV
# (2 * 2 * 2 * 3 = 24 candidate combinations)
grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [40, 60],
    'min_samples_leaf': [5, 10, 20]
}

# Doing Grid Search: 5-fold cross-validation scored by ROC-AUC
gs_clf = GridSearchCV(estimator=clf,
                      param_grid=grid,
                      cv=5,
                      scoring='roc_auc',
                      n_jobs=3,
                      verbose=2)

# Fit on the complete training data; the best estimator is refit on all of it
gs_clf.fit(transformed_X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [8]:
# Pull the winning hyperparameter combination and its mean cross-validated
# score out of the fitted grid search, then report both.
best_params, roc_auc_score = gs_clf.best_params_, gs_clf.best_score_

print(f"Best Parameters : {best_params}")
print(f"roc-auc score : {roc_auc_score:.2f}")

Best Parameters : {'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 40, 'n_estimators': 100}
roc-auc score : 0.86


In [34]:
# Read the test features and drop the same columns that were dropped from the training features
test_features = pd.read_csv("data/test_set_features.csv")
test_features = test_features.drop(["respondent_id", "employment_industry", "employment_occupation"], axis=1)

# Apply the same imputation as in training: missing health_insurance values get the
# training-set mean. `data["health_insurance"]` was already mean-filled, which leaves
# its mean unchanged, so this is the original training mean.
test_features["health_insurance"] = test_features["health_insurance"].fillna(data["health_insurance"].mean())

# Reuse the ColumnTransformer fitted on the training data (transform, NOT fit_transform).
# Fitting a fresh encoder on the test set could produce a different set or order of
# one-hot columns than the model was trained on.
transformed_features = pd.DataFrame(transformer.transform(test_features))

# For multi-output targets predict_proba returns one (n_samples, n_classes) array per
# target, in the order of the y columns: [xyz_vaccine, seasonal_vaccine].
predictions = gs_clf.predict_proba(transformed_features)

# Columns of each array follow classes_ order, i.e. ascending class label.
# Assuming the labels are 0 = not vaccinated and 1 = vaccinated (TODO confirm),
# column 0 is P(No) and column 1 is P(Yes).
predictions_xyz = pd.DataFrame(predictions[0],
                               columns=["No", "Yes"])
predictions_seasonal = pd.DataFrame(predictions[1],
                                    columns=["No", "Yes"])

# Use a 1-based row index in the exported files.
# NOTE(review): this is a running row number, not the respondent_id -- confirm this is intended.
predictions_xyz.index = predictions_xyz.index+1
predictions_seasonal.index = predictions_seasonal.index+1

predictions_xyz.to_csv("xyz_predictions")
predictions_seasonal.to_csv("seasonal_predictions")

In [22]:
import pickle

# Save the fitted grid-search model to file.
# The context manager guarantees the file handle is flushed and closed even if
# pickling fails part-way through.
with open("final_model.pkl", "wb") as model_file:
    pickle.dump(gs_clf, model_file)

# NOTE(review): `submission` is not defined in any visible cell, so this line raises
# NameError on a fresh kernel (Restart & Run All). Define it before this point or
# remove the line.
submission.to_csv("answer")