First, we import the the training data sets

training_set_labels and training_set_features

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import KNNImputer#, IterativeImputer
from sklearn import svm


# %mathplotlib inline

# displays all the columns
pd.set_option('display.max_columns', None)
plt.rcParams["figure.figsize"] = (18, 8);

dataset_features= pd.read_csv('dataset and all/training_set_features.csv')
dataset_labels=pd.read_csv('dataset and all/training_set_labels.csv')

print(dataset_features.shape)
print(dataset_features.columns)

print(dataset_labels.shape)
print(dataset_labels.columns)



(26707, 36)
Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
(26707, 3)
Index(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], dtype='object')


We have to pre-process the data to convert categorical data to ordinal data, and also fill in any missing data with the appropriate values (Mode for now)
First, let us find the unique catagories in all the columns

In [2]:
for col in dataset_features.columns:
    print(col, dataset_features[col].unique())

respondent_id [    0     1     2 ... 26704 26705 26706]
xyz_concern [ 1.  3.  2.  0. nan]
xyz_knowledge [ 0.  2.  1. nan]
behavioral_antiviral_meds [ 0.  1. nan]
behavioral_avoidance [ 0.  1. nan]
behavioral_face_mask [ 0.  1. nan]
behavioral_wash_hands [ 0.  1. nan]
behavioral_large_gatherings [ 0.  1. nan]
behavioral_outside_home [ 1.  0. nan]
behavioral_touch_face [ 1.  0. nan]
doctor_recc_xyz [ 0. nan  1.]
doctor_recc_seasonal [ 0. nan  1.]
chronic_med_condition [ 0.  1. nan]
child_under_6_months [ 0.  1. nan]
health_worker [ 0.  1. nan]
health_insurance [ 1. nan  0.]
opinion_xyz_vacc_effective [ 3.  5.  4.  2.  1. nan]
opinion_xyz_risk [ 1.  4.  3.  2.  5. nan]
opinion_xyz_sick_from_vacc [ 2.  4.  1.  5.  3. nan]
opinion_seas_vacc_effective [ 2.  4.  5.  3.  1. nan]
opinion_seas_risk [ 1.  2.  4.  3.  5. nan]
opinion_seas_sick_from_vacc [ 2.  4.  1.  5. nan  3.]
age_group ['55 - 64 Years' '35 - 44 Years' '18 - 34 Years' '65+ Years'
 '45 - 54 Years']
education ['< 12 Years' '12 Yea

In [3]:
mapping_age_group = {"18 - 34 Years": 1, "35 - 44 Years": 2, "45 - 54 Years": 3, "55 - 64 Years": 4, "65+ Years": 5}
mapping_race = {"White": 1, "Black": 2, "Hispanic": 3, "Other or Multiple": 4}
mapping_sex = {"Male": 1, "Female": 2}
mapping_income_poverty = {"Below Poverty": 1, "<= $75,000, Above Poverty": 2, "> $75,000": 3}
mapping_marital_status = {"Married": 1, "Not Married": 2}
mapping_rent_or_own = {"Own": 1, "Rent": 2}
mapping_employment_status = {"Employed": 1, "Not in Labor Force": 2, "Unemployed": 3}
mapping_census_msa = {"Non-MSA": 1, "MSA, Not Principle City": 2, "MSA, Principle City": 3}
mapping_household_adults = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, np.nan: -1}
mapping_household_children = {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, np.nan: -1}
mapping_geo = {
    'oxchjgsf': 1,
    'bhuqouqj': 2,
    'qufhixun': 3,
    'lrircsnp': 4,
    'atmpeygn': 5,
    'lzgpxyit': 6,
    'fpwskwrf': 7,
    'mlyzmhmf': 8,
    'dqpwygqj': 9,
    'kbazzjca': 10
}
mapping_employment_industry = {
    # np.nan: -1,
    'pxcmvdjn': 1,
    'rucpziij': 2,
    'wxleyezf': 3,
    'saaquncn': 4,
    'xicduogh': 5,
    'ldnlellj': 6,
    'wlfvacwt': 7,
    'nduyfdeo': 8,
    'fcxhlnwr': 9,
    'vjjrobsf': 10,
    'arjwrbjb': 11,
    'atmlpfrs': 12,
    'msuufmds': 13,
    'xqicxuve': 14,
    'phxvnwax': 15,
    'dotnnunm': 16,
    'mfikgejo': 17,
    'cfqqtusy': 18,
    'mcubkhph': 19,
    'haxffmxo': 20,
    'qnlwzans': 21
}
mapping_employment_occupation = {
    # np.nan: -1,
    'xgwztkwe': 1,
    'xtkaffoo': 2,
    'emcorrxb': 3,
    'vlluhbov': 4,
    'xqwwgdyp': 5,
    'ccgxvspp': 6,
    'qxajmpny': 7,
    'kldqjyjy': 8,
    'mxkfnird': 9,
    'hfxkjkmi': 10,
    'bxpfxfdn': 11,
    'ukymxvdu': 12,
    'cmhcxjea': 13,
    'haliazsg': 14,
    'dlvbwzss': 15,
    'xzmlyyjv': 16,
    'oijqvulv': 17,
    'rcertsgn': 18,
    'tfqavkke': 19,
    'hodpvpew': 20,
    'uqqtjvyb': 21,
    'pvmttkik': 22,
    'dcjcmpih': 23
}
mapping_education = {
    '< 12 Years': 1,
    '12 Years': 2,
    'College Graduate': 3,
    'Some College': 4
}

dataset=dataset_features


dataset["age_group"]=dataset.age_group.map(mapping_age_group)
dataset["race"]=dataset.race.map(mapping_race)
dataset["sex"]=dataset.sex.map(mapping_sex)
dataset["income_poverty"] = dataset["income_poverty"].map(mapping_income_poverty)
dataset["marital_status"] = dataset["marital_status"].map(mapping_marital_status)
dataset["rent_or_own"] = dataset["rent_or_own"].map(mapping_rent_or_own)
dataset["employment_status"] = dataset["employment_status"].map(mapping_employment_status)
dataset["census_msa"] = dataset["census_msa"].map(mapping_census_msa)
dataset["household_adults"] = dataset["household_adults"].map(mapping_household_adults)
dataset["household_children"] = dataset["household_children"].map(mapping_household_children)
dataset["hhs_geo_region"] = dataset["hhs_geo_region"].map(mapping_geo)
dataset["employment_industry"] = dataset["employment_industry"].map(mapping_employment_industry)
dataset["employment_occupation"] = dataset["employment_occupation"].map(mapping_employment_occupation)
dataset["education"] = dataset["education"].map(mapping_education)


dataset_features=dataset


print(dataset.columns)
# print(dataset.head)
for col in dataset.columns:
    print(col, dataset[col].unique())


Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
respondent_id [    0     1     2 ... 26704 26705 26706]
xyz_concern [ 1.  3.  2.  0. nan]
xyz_knowledge [ 0.  2.  1. nan]
beh

Handling Null values with necessary method

In [4]:
dataset=dataset_features
for column in dataset.columns:
#     selected_value = dataset[column].mode()[0]
#     # selected_value=dataset[column].mean()
    selected_value=-1
    dataset[column].fillna(selected_value, inplace=True)


# imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
# imputer.fit(dataset)
# dataset=imputer.transform(dataset)

# imputer=IterativeImputer()
# imputer.fit(dataset)
# dataset=imputer.transform(dataset)



# print(dataset.head)
print(dataset.isnull().sum())
# print('Missing: %d' % sum(np.isnan(dataset).flatten()))

dataset_features=dataset

respondent_id                  0
xyz_concern                    0
xyz_knowledge                  0
behavioral_antiviral_meds      0
behavioral_avoidance           0
behavioral_face_mask           0
behavioral_wash_hands          0
behavioral_large_gatherings    0
behavioral_outside_home        0
behavioral_touch_face          0
doctor_recc_xyz                0
doctor_recc_seasonal           0
chronic_med_condition          0
child_under_6_months           0
health_worker                  0
health_insurance               0
opinion_xyz_vacc_effective     0
opinion_xyz_risk               0
opinion_xyz_sick_from_vacc     0
opinion_seas_vacc_effective    0
opinion_seas_risk              0
opinion_seas_sick_from_vacc    0
age_group                      0
education                      0
race                           0
sex                            0
income_poverty                 0
marital_status                 0
rent_or_own                    0
employment_status              0
hhs_geo_re

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(selected_value, inplace=True)


We can now split the dataset into training and testing sets. We will use 80% of the data for training and 20% for testing.

In [5]:
y1=dataset_labels['xyz_vaccine']
y2=dataset_labels['seasonal_vaccine']
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(dataset_features, y1, y2, test_size=0.2, random_state=30)

Standardize the Features (if necessary)

Standardizing can be beneficial for logistic regression


In [6]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


We can now train the model on the dataset using the model of your choice

In [7]:
# # Model for the first target variable
# model1 = RandomForestClassifier(random_state=42)
# model1.fit(X_train, y1_train)

# # Model for the second target variable
# model2 = RandomForestClassifier(random_state=42)
# model2.fit(X_train, y2_train)

We can now predict and test the model using the testing data set

In [8]:
# # Predictions for the first target variable
# y1_pred = model1.predict(X_test)

# # Predictions for the second target variable
# y2_pred = model2.predict(X_test)


# # Evaluation for the first target variable
# print("Accuracy for the first target variable:", accuracy_score(y1_test, y1_pred))
# print("Classification report for the first target variable:\n", classification_report(y1_test, y1_pred))

# # Evaluation for the second target variable
# print("Accuracy for the second target variable:", accuracy_score(y2_test, y2_pred))
# print("Classification report for the second target variable:\n", classification_report(y2_test, y2_pred))


We can also test the roc score for the models

In [9]:
from sklearn.metrics import roc_auc_score

# Initialize the models
logistic_model1 = LogisticRegression(random_state=42)
logistic_model2 = LogisticRegression(random_state=42)
rf_model1 = RandomForestClassifier(random_state=42)
rf_model2 = RandomForestClassifier(random_state=42)
# svm_model1 = svm.SVC(probability=True)
# svm_model2 = svm.SVC(probability=True)

# Train the models on the training data
logistic_model1.fit(X_train, y1_train)
logistic_model2.fit(X_train, y2_train)
rf_model1.fit(X_train, y1_train)
rf_model2.fit(X_train, y2_train)
# svm_model1.fit(X_train, y1_train)
# svm_model2.fit(X_train, y2_train)

# Make predictions on the test data
y1_pred_logistic = logistic_model1.predict_proba(X_test)[:, 1]
y2_pred_logistic = logistic_model2.predict_proba(X_test)[:, 1]
y1_pred_rf = rf_model1.predict_proba(X_test)[:, 1]
y2_pred_rf = rf_model2.predict_proba(X_test)[:, 1]
# y1_pred_svm = svm_model1.predict_proba(X_test)[:, 1]
# y2_pred_svm = svm_model2.predict_proba(X_test)[:, 1]

# Calculate ROC AUC scores
roc_auc1_logistic = roc_auc_score(y1_test, y1_pred_logistic)
roc_auc2_logistic = roc_auc_score(y2_test, y2_pred_logistic)
roc_auc1_rf = roc_auc_score(y1_test, y1_pred_rf)
roc_auc2_rf = roc_auc_score(y2_test, y2_pred_rf)
# roc_auc1_svm = roc_auc_score(y1_test, y1_pred_svm)
# roc_auc2_svm = roc_auc_score(y2_test, y2_pred_svm)


print(f"ROC AUC for target_var1 with Logistic Regression: {roc_auc1_logistic}")
print(f"ROC AUC for target_var2 with Logistic Regression: {roc_auc2_logistic}")
print(f"ROC AUC for target_var1 with Random Forest: {roc_auc1_rf}")
print(f"ROC AUC for target_var2 with Random Forest: {roc_auc2_rf}")
# print(f"ROC AUC for target_var1 with SVM: {roc_auc1_svm}")
# print(f"ROC AUC for target_var2 with SVM: {roc_auc2_svm}")

ROC AUC for target_var1 with Logistic Regression: 0.8428415671948233
ROC AUC for target_var2 with Logistic Regression: 0.8305849486194827
ROC AUC for target_var1 with Random Forest: 0.8636247207951577
ROC AUC for target_var2 with Random Forest: 0.8587782017403549


We can tune the hyperparameters to get better performance

In [10]:
# from sklearn.model_selection import GridSearchCV

# # Define the parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_features': [ 'sqrt', 'log2'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
# }

# # Initialize the Random Forest model
# rf_model = RandomForestClassifier(random_state=42)

# # Initialize the GridSearchCV object
# grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y1_train)

# # Print the best parameters and best ROC AUC score
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best ROC AUC: {grid_search.best_score_}")


In [11]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Define the parameter grid for hyperparameter tuning
param_distributions = {
    'n_estimators': [100, 200, 300],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Initialize the RandomizedSearchCV object
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_distributions, 
                                   n_iter=50, cv=5, scoring='roc_auc', n_jobs=-1, verbose=1, random_state=42)

# Fit the randomized search to the data for target_var1
random_search.fit(X_train, y1_train)

# Print the best parameters and best ROC AUC score for target_var1
print(f"Best parameters for target_var1: {random_search.best_params_}")
print(f"Best ROC AUC for target_var1: {random_search.best_score_}")

# Use the best model to predict and calculate ROC AUC for target_var1
best_rf_model = random_search.best_estimator_
y1_pred_best_rf = best_rf_model.predict_proba(X_test)[:, 1]
roc_auc1_best_rf = roc_auc_score(y1_test, y1_pred_best_rf)

print(f"Best ROC AUC for target_var1 with tuned Random Forest: {roc_auc1_best_rf}")

# Repeat the random search for target_var2
random_search.fit(X_train, y2_train)

# Print the best parameters and best ROC AUC score for target_var2
print(f"Best parameters for target_var2: {random_search.best_params_}")
print(f"Best ROC AUC for target_var2: {random_search.best_score_}")

# Use the best model to predict and calculate ROC AUC for target_var2
best_rf_model = random_search.best_estimator_
y2_pred_best_rf = best_rf_model.predict_proba(X_test)[:, 1]
roc_auc2_best_rf = roc_auc_score(y2_test, y2_pred_best_rf)

print(f"Best ROC AUC for target_var2 with tuned Random Forest: {roc_auc2_best_rf}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  10.0s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  10.2s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  10.4s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; total time=  10.6s
[CV] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   4.9s
[CV] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   4.8s
[CV] END bootstrap=False, max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=300; 