In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [24]:
data = pd.read_csv('training_set_features.csv')
print(data.columns)
print(data.shape)

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
(26707, 36)


In [28]:
threshold = 0.40

# Find columns with more than 40% missing values
columns_to_drop = data.columns[data.isna().mean() > threshold]

# Drop those columns
data.drop(columns=columns_to_drop, inplace=True)

# Optional: Reset index if needed (not necessary for this operation)
# df.reset_index(drop=True, inplace=True)

# Display the DataFrame after dropping columns
print("DataFrame after dropping columns with more than 40% missing values:")
print(data)

DataFrame after dropping columns with more than 40% missing values:
       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0                  0          1.0            0.0                        0.0   
1                  1          3.0            2.0                        0.0   
2                  2          1.0            1.0                        0.0   
3                  3          1.0            1.0                        0.0   
4                  4          2.0            1.0                        0.0   
...              ...          ...            ...                        ...   
26702          26702          2.0            0.0                        0.0   
26703          26703          1.0            2.0                        0.0   
26704          26704          2.0            2.0                        0.0   
26705          26705          1.0            1.0                        0.0   
26706          26706          0.0            0.0               

In [29]:
numerical_cols = data.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='median')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# Example for categorical columns (using mode imputation)
categorical_cols = data.select_dtypes(include='object').columns
imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer.fit_transform(data[categorical_cols])
print(data.head(5))

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0            0.0          1.0            0.0                        0.0   
1            1.0          3.0            2.0                        0.0   
2            2.0          1.0            1.0                        0.0   
3            3.0          1.0            1.0                        0.0   
4            4.0          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [30]:

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through each categorical column
for col in ['education', 'age_group']:
    data[col] = label_encoder.fit_transform(data[col])
    

# Perform one-hot encoding using pandas get_dummies function
data = pd.get_dummies(data, columns=['census_msa', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region','race', 'sex'])
    
print(data)


       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0                0.0          1.0            0.0                        0.0   
1                1.0          3.0            2.0                        0.0   
2                2.0          1.0            1.0                        0.0   
3                3.0          1.0            1.0                        0.0   
4                4.0          2.0            1.0                        0.0   
...              ...          ...            ...                        ...   
26702        26702.0          2.0            0.0                        0.0   
26703        26703.0          1.0            2.0                        0.0   
26704        26704.0          2.0            2.0                        0.0   
26705        26705.0          1.0            1.0                        0.0   
26706        26706.0          0.0            0.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  

In [31]:
# Replace True with 1 and False with 0
for col in data.select_dtypes(include=['bool']).columns:
    data[col] = data[col].astype(int)
print(data.head)

<bound method NDFrame.head of        respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0                0.0          1.0            0.0                        0.0   
1                1.0          3.0            2.0                        0.0   
2                2.0          1.0            1.0                        0.0   
3                3.0          1.0            1.0                        0.0   
4                4.0          2.0            1.0                        0.0   
...              ...          ...            ...                        ...   
26702        26702.0          2.0            0.0                        0.0   
26703        26703.0          1.0            2.0                        0.0   
26704        26704.0          2.0            2.0                        0.0   
26705        26705.0          1.0            1.0                        0.0   
26706        26706.0          0.0            0.0                        0.0   

       behavioral_avo

In [38]:
"""data['income_poverty'] = data['income_poverty'].replace({
    'Below Poverty': 0,
    '<= $75,000, Above Poverty': 1,
    '> $75,000': 1
})
"""

yo =data['income_poverty'].unique()
print(yo)

[0 1]


In [45]:
df = pd.read_csv('training_set_labels.csv')

print(df.head)

<bound method NDFrame.head of        respondent_id  xyz_vaccine  seasonal_vaccine
0                  0            0                 0
1                  1            0                 1
2                  2            0                 0
3                  3            0                 1
4                  4            0                 0
...              ...          ...               ...
26702          26702            0                 0
26703          26703            0                 0
26704          26704            0                 1
26705          26705            0                 0
26706          26706            0                 0

[26707 rows x 3 columns]>


In [47]:
# Merge on respondent_id
df1 = pd.merge(data, df, on='respondent_id')

print(df1)

       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0                0.0          1.0            0.0                        0.0   
1                1.0          3.0            2.0                        0.0   
2                2.0          1.0            1.0                        0.0   
3                3.0          1.0            1.0                        0.0   
4                4.0          2.0            1.0                        0.0   
...              ...          ...            ...                        ...   
26702        26702.0          2.0            0.0                        0.0   
26703        26703.0          1.0            2.0                        0.0   
26704        26704.0          2.0            2.0                        0.0   
26705        26705.0          1.0            1.0                        0.0   
26706        26706.0          0.0            0.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  

In [48]:
# Separate features and target variables
X_xyz = df1.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine', 'doctor_recc_seasonal', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc'], axis=1)
X_seasonal = df1.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine', 'xyz_concern', 'xyz_knowledge', 'doctor_recc_xyz', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc'], axis=1)
y_xyz = df1['xyz_vaccine']
y_seasonal = df1['seasonal_vaccine']

In [49]:
# Split the data into training and testing sets
X_xyz_train, X_xyz_test, X_seasonal_train, X_seasonal_test, y_train_xyz, y_test_xyz, y_train_seasonal, y_test_seasonal = train_test_split(
    X_xyz, X_seasonal, y_xyz, y_seasonal, test_size=0.2, random_state=42
)

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Initialize logistic regression models
model_xyz = LogisticRegression(random_state=42)
model_seasonal = LogisticRegression(random_state=42)

# Train the models
model_xyz.fit(X_xyz_train, y_train_xyz)
model_seasonal.fit(X_seasonal_train, y_train_seasonal)

In [51]:
y_pred_proba_xyz = model_xyz.predict_proba(X_xyz_test)[:, 1]
y_pred_proba_seasonal = model_seasonal.predict_proba(X_seasonal_test)[:, 1]

# Calculate ROC AUC scores
roc_auc_xyz = roc_auc_score(y_test_xyz, y_pred_proba_xyz)
roc_auc_seasonal = roc_auc_score(y_test_seasonal, y_pred_proba_seasonal)

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')

ROC AUC for xyz_vaccine: 0.8224774979199758
ROC AUC for seasonal_vaccine: 0.8474197346511163


In [52]:
train = pd.read_csv('modified_test_set_features.csv')
print(train.columns)
print(train.shape)

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'income_poverty', 'household_adults', 'household_children',
       'census_msa_MSA, Not Principle  City', 'census_msa_MSA, Principle City',
       'census_msa_Non-MSA', 'marital_status_Married',
       'marital_status_Not Married', 'rent_or_own_Own', 'rent_or_own_Rent',
       'employment_status_Employed', 'employment_status_Not in Labor Force',
       'employment_status_Unemployed', 'hhs_geo_re

In [53]:
tX_xyz = train.drop(['respondent_id', 'doctor_recc_seasonal', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc'], axis=1)
tX_seasonal = train.drop(['respondent_id', 'xyz_concern', 'xyz_knowledge', 'doctor_recc_xyz', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc'], axis=1)


In [54]:
# Predict probabilities on the test set
ty_test_pred_proba_xyz = model_xyz.predict_proba(tX_xyz)[:, 1]
ty_test_pred_proba_seasonal = model_seasonal.predict_proba(tX_seasonal)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({
    'respondent_id': train.loc[tX_xyz.index, 'respondent_id'],
    'xyz_vaccine': ty_test_pred_proba_xyz,
    'seasonal_vaccine': ty_test_pred_proba_seasonal
})

# Save the submission file
submission.to_csv('submission.csv', index=False)