In [29]:
# Load the training features, training labels and test features, then preview them
import pandas as pd

# Single place to change when the CSV files live somewhere else.
DATA_DIR = '/content/hacthon'

train_features = pd.read_csv(f'{DATA_DIR}/training_set_features.csv')
train_labels = pd.read_csv(f'{DATA_DIR}/training_set_labels.csv')
test_features = pd.read_csv(f'{DATA_DIR}/test_set_features.csv')

print(train_features.head())
print(train_labels.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after each report.
train_features.info()
train_labels.info()


   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [17]:
# Merge the features and labels frames into a single training frame.
# validate='one_to_one' makes pandas raise a MergeError if respondent_id is
# duplicated on either side, instead of silently multiplying rows.
train_data = train_features.merge(
    train_labels, on='respondent_id', validate='one_to_one'
)

print(train_data.head())
# info() prints its own report and returns None, so it is not wrapped in print().
train_data.info()

   respondent_id  xyz_concern  xyz_knowledge  behavioral_antiviral_meds  \
0              0          1.0            0.0                        0.0   
1              1          3.0            2.0                        0.0   
2              2          1.0            1.0                        0.0   
3              3          1.0            1.0                        0.0   
4              4          2.0            1.0                        0.0   

   behavioral_avoidance  behavioral_face_mask  behavioral_wash_hands  \
0                   0.0                   0.0                    0.0   
1                   1.0                   0.0                    1.0   
2                   1.0                   0.0                    0.0   
3                   1.0                   0.0                    1.0   
4                   1.0                   0.0                    1.0   

   behavioral_large_gatherings  behavioral_outside_home  \
0                          0.0                      1.0  

In [20]:
# Data preprocessing: impute numeric gaps, label-encode categoricals, then
# separate the feature matrix from the two target columns.
# NOTE: train_data and test_features are modified in place, so re-running this
# cell without re-running the loading/merging cells starts from already
# processed frames.
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np

# Handling missing values
# Only numeric columns that exist in BOTH frames are imputed. The merged
# train_data also carries the numeric target columns; those are absent from
# test_features and must be neither imputed nor copied into the test frame.
numeric_cols = (
    train_data.select_dtypes(include=['number'])
    .columns.intersection(test_features.columns)
)

# Medians are computed from the training rows only and reused for the test
# rows, so both frames are imputed with the same values.
train_medians = train_data[numeric_cols].median()
train_data[numeric_cols] = train_data[numeric_cols].fillna(train_medians)
# Fill the test frame's own values (previously the training rows were
# assigned into test_features here, overwriting the test data).
test_features[numeric_cols] = test_features[numeric_cols].fillna(train_medians)

# Encoding the categorical variables
categorical_cols = train_data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # astype(str) turns missing values into the string 'nan', so they are
    # encoded as their own class rather than raising.
    train_data[col] = le.fit_transform(train_data[col].astype(str))
    # transform() raises ValueError if the test frame holds a value that was
    # never seen in the training column.
    test_features[col] = le.transform(test_features[col].astype(str))
    # Keep the fitted encoder so the mapping can be looked up later.
    label_encoders[col] = le

# Separate the features from the two target variables
X = train_data.drop(['respondent_id', 'xyz_vaccine', 'seasonal_vaccine'], axis=1)
y_xyz = train_data['xyz_vaccine']
y_seasonal = train_data['seasonal_vaccine']

In [21]:
# Model training: one Random Forest per target on a shared train/validation split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. Splitting the features and both
# targets in one call keeps all three row-aligned.
(
    X_train, X_val,
    y_xyz_train, y_xyz_val,
    y_seasonal_train, y_seasonal_val,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=42)

# fit() returns the estimator itself, so construction and training are chained.
model_xyz = RandomForestClassifier(random_state=42).fit(X_train, y_xyz_train)
model_seasonal = RandomForestClassifier(random_state=42).fit(X_train, y_seasonal_train)

# Leave the last fitted model as the cell's final expression so the notebook
# still renders its repr as the cell output.
model_seasonal

In [32]:
from sklearn.metrics import roc_auc_score

# Validation ROC AUC for the XYZ vaccine model, scored on column 1 of
# predict_proba.
y_xyz_pred = model_xyz.predict_proba(X_val)[:, 1]
roc_auc_xyz = roc_auc_score(y_xyz_val, y_xyz_pred)

# Same evaluation for the seasonal vaccine model.
y_seasonal_pred = model_seasonal.predict_proba(X_val)[:, 1]
roc_auc_seasonal = roc_auc_score(y_seasonal_val, y_seasonal_pred)

# Report both scores, XYZ first.
print(f'ROC AUC for XYZ vaccine: {roc_auc_xyz}')
print(f'ROC AUC for Seasonal vaccine: {roc_auc_seasonal}')

ROC AUC for XYZ vaccine: 0.8291918790812596
ROC AUC for Seasonal vaccine: 0.855460699724987


In [40]:
# Make predictions on the test data.
# Work on a copy restricted to the columns the models were trained on, in the
# same order. test_features itself is left untouched, so this cell can be
# re-run safely and any extra columns in the test frame are ignored.
feature_cols = list(X.columns)
test_X = test_features[feature_cols].copy()

# Fill any remaining numeric gaps with the TRAINING medians so the test rows
# are imputed with the same values the models saw during training.
numeric_feature_cols = [col for col in numeric_cols if col in feature_cols]
test_X[numeric_feature_cols] = test_X[numeric_feature_cols].fillna(
    train_data[numeric_feature_cols].median()
)

# Encode only the categorical columns that are still raw. Columns already
# encoded by the preprocessing cell are numeric; running them through the
# encoder again would fail because their stringified codes are unseen labels.
for col in categorical_cols:
    if not pd.api.types.is_numeric_dtype(test_X[col]):
        test_X[col] = label_encoders[col].transform(test_X[col].astype(str))

# Column 1 of predict_proba is used as the predicted score for each target.
xyz_pred_test = model_xyz.predict_proba(test_X)[:, 1]
seasonal_pred_test = model_seasonal.predict_proba(test_X)[:, 1]

In [44]:
# Assemble the submission table: respondent ids plus one probability column
# per vaccine, in the order respondent_id, xyz_vaccine, seasonal_vaccine.
submission = test_features[['respondent_id']].assign(
    xyz_vaccine=xyz_pred_test,
    seasonal_vaccine=seasonal_pred_test,
)

# Write a local copy first, without the index column.
submission.to_csv('submission.csv', index=False)


# Then mount Google Drive and store a second copy there.
from google.colab import drive
drive.mount('/content/drive')
submission.to_csv('/content/drive/My Drive/submission.csv', index=False)

Mounted at /content/drive
