In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
data = pd.read_csv('test_set_features.csv')
print(data.columns)
print(data.shape)

Index(['respondent_id', 'xyz_concern', 'xyz_knowledge',
       'behavioral_antiviral_meds', 'behavioral_avoidance',
       'behavioral_face_mask', 'behavioral_wash_hands',
       'behavioral_large_gatherings', 'behavioral_outside_home',
       'behavioral_touch_face', 'doctor_recc_xyz', 'doctor_recc_seasonal',
       'chronic_med_condition', 'child_under_6_months', 'health_worker',
       'health_insurance', 'opinion_xyz_vacc_effective', 'opinion_xyz_risk',
       'opinion_xyz_sick_from_vacc', 'opinion_seas_vacc_effective',
       'opinion_seas_risk', 'opinion_seas_sick_from_vacc', 'age_group',
       'education', 'race', 'sex', 'income_poverty', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region', 'census_msa',
       'household_adults', 'household_children', 'employment_industry',
       'employment_occupation'],
      dtype='object')
(26708, 36)


In [4]:
threshold = 0.40

# Find columns with more than 40% missing values
columns_to_drop = data.columns[data.isna().mean() > threshold]

# Drop those columns
data.drop(columns=columns_to_drop, inplace=True)

# Optional: Reset index if needed (not necessary for this operation)
# df.reset_index(drop=True, inplace=True)

# Display the DataFrame after dropping columns
print("DataFrame after dropping columns with more than 40% missing values:")
print(data)

DataFrame after dropping columns with more than 40% missing values:
       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              26707          2.0            2.0                        0.0   
1              26708          1.0            1.0                        0.0   
2              26709          2.0            2.0                        0.0   
3              26710          1.0            1.0                        0.0   
4              26711          3.0            1.0                        1.0   
...              ...          ...            ...                        ...   
26703          53410          1.0            1.0                        0.0   
26704          53411          3.0            1.0                        0.0   
26705          53412          0.0            1.0                        0.0   
26706          53413          3.0            1.0                        0.0   
26707          53414          2.0            1.0               

In [5]:
numerical_cols = data.select_dtypes(include='number').columns
imputer = SimpleImputer(strategy='median')
data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# Example for categorical columns (using mode imputation)
categorical_cols = data.select_dtypes(include='object').columns
imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = imputer.fit_transform(data[categorical_cols])
print(data.head(5))

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0        26707.0          2.0            2.0                        0.0   
1        26708.0          1.0            1.0                        0.0   
2        26709.0          2.0            2.0                        0.0   
3        26710.0          1.0            1.0                        0.0   
4        26711.0          3.0            1.0                        1.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   1.0                   0.0                    1.0   
1                   0.0                   0.0                    0.0   
2                   0.0                   1.0                    1.0   
3                   0.0                   0.0                    0.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          1.0                      0.0  

In [6]:

from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Iterate through each categorical column
for col in ['education', 'age_group']:
    data[col] = label_encoder.fit_transform(data[col])
    

# Perform one-hot encoding using pandas get_dummies function
data = pd.get_dummies(data, columns=['census_msa', 'marital_status',
       'rent_or_own', 'employment_status', 'hhs_geo_region','race', 'sex'])
    
print(data)


       respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0            26707.0          2.0            2.0                        0.0   
1            26708.0          1.0            1.0                        0.0   
2            26709.0          2.0            2.0                        0.0   
3            26710.0          1.0            1.0                        0.0   
4            26711.0          3.0            1.0                        1.0   
...              ...          ...            ...                        ...   
26703        53410.0          1.0            1.0                        0.0   
26704        53411.0          3.0            1.0                        0.0   
26705        53412.0          0.0            1.0                        0.0   
26706        53413.0          3.0            1.0                        0.0   
26707        53414.0          2.0            1.0                        0.0   

       behavioral_avoidance  behavioral_face_mask  

In [7]:
# Replace True with 1 and False with 0
for col in data.select_dtypes(include=['bool']).columns:
    data[col] = data[col].astype(int)
print(data.head)

<bound method NDFrame.head of        respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0            26707.0          2.0            2.0                        0.0   
1            26708.0          1.0            1.0                        0.0   
2            26709.0          2.0            2.0                        0.0   
3            26710.0          1.0            1.0                        0.0   
4            26711.0          3.0            1.0                        1.0   
...              ...          ...            ...                        ...   
26703        53410.0          1.0            1.0                        0.0   
26704        53411.0          3.0            1.0                        0.0   
26705        53412.0          0.0            1.0                        0.0   
26706        53413.0          3.0            1.0                        0.0   
26707        53414.0          2.0            1.0                        0.0   

       behavioral_avo

In [10]:
"""data['income_poverty'] = data['income_poverty'].replace({
    'Below Poverty': 0,
    '<= $75,000, Above Poverty': 1,
    '> $75,000': 1
})
"""

yo =data['income_poverty'].unique()
print(yo)

[1 0]


In [11]:
data.to_csv('modified_test_set_features.csv', index=False)