In [1]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Folder that holds the three input CSV files. Keeping it in one constant
# means the notebook can be pointed at another copy of the data by editing a
# single line instead of three hard-coded absolute paths.
DATA_DIR = Path(r'C:\Users\Manoj\Desktop\dataset and all1')

# Training features, training labels, and the test-set features.
train_features = pd.read_csv(DATA_DIR / 'training_set_features.csv')
train_labels = pd.read_csv(DATA_DIR / 'training_set_labels.csv')
test_features = pd.read_csv(DATA_DIR / 'test_set_features.csv')


In [3]:
# Set the respondent identifiers aside before they are removed from the
# feature frames; the test ids are reused when the submission is assembled.
train_respondent_id, test_respondent_id = (
    frame['respondent_id'] for frame in (train_features, test_features)
)

In [4]:
# Remove the identifier column so it is not used as a model feature.
# errors='ignore' keeps this cell idempotent: because it overwrites its own
# input frames, re-running it would otherwise raise KeyError once the column
# is already gone.
train_features = train_features.drop(columns=['respondent_id'], errors='ignore')
test_features = test_features.drop(columns=['respondent_id'], errors='ignore')

In [5]:
# Partition the feature names by dtype: float64/int64 columns are handled as
# numeric, object columns as categorical.
numeric_frame = train_features.select_dtypes(include=['float64', 'int64'])
categorical_frame = train_features.select_dtypes(include=['object'])
num_cols, cat_cols = numeric_frame.columns, categorical_frame.columns

In [6]:
# Fill missing values: column mean for numeric features, most frequent value
# for categorical ones.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')

# Each imputer learns its fill values from the training frame only and is then
# applied to both the training and the test frame.
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    imputer.fit(train_features[cols])
    train_features[cols] = imputer.transform(train_features[cols])
    test_features[cols] = imputer.transform(test_features[cols])


In [7]:
# One-hot encode the categorical columns. The encoder is fitted on the training
# frame and reused on the test frame, so both frames receive the same indicator
# columns. drop='first' omits the first category of each feature, and
# sparse_output=False yields a dense array that can be wrapped in a DataFrame.
# NOTE(review): handle_unknown is left at its default, so transform() raises if
# the test frame holds a category that was not seen during fit — confirm that
# this is the desired behavior.
encoder = OneHotEncoder(drop='first', sparse_output=False)

train_encoded_values = encoder.fit_transform(train_features[cat_cols])
encoded_names = encoder.get_feature_names_out(cat_cols)

# Reuse the source frame's index for the encoded frame: pd.concat(axis=1)
# aligns rows on index labels, so a fresh 0..n-1 index would misalign rows (and
# introduce NaN rows) whenever the feature frame is indexed differently.
train_features_encoded = pd.DataFrame(
    train_encoded_values,
    columns=encoded_names,
    index=train_features.index,
)
train_features = pd.concat(
    [train_features.drop(columns=cat_cols), train_features_encoded],
    axis=1,
)

test_features_encoded = pd.DataFrame(
    encoder.transform(test_features[cat_cols]),
    columns=encoded_names,
    index=test_features.index,
)
test_features = pd.concat(
    [test_features.drop(columns=cat_cols), test_features_encoded],
    axis=1,
)

In [8]:
# Standardize the numeric columns. The scaler statistics come from the training
# frame only and are applied unchanged to the test frame.
scaler = StandardScaler().fit(train_features[num_cols])
train_features[num_cols] = scaler.transform(train_features[num_cols])
test_features[num_cols] = scaler.transform(test_features[num_cols])

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score


In [13]:
# The two label columns that are modeled, one classifier per column.
target_columns = ['xyz_vaccine', 'seasonal_vaccine']
y = train_labels[target_columns]


In [14]:
# Hold out 20% of the training rows for validation; the fixed seed makes the
# split repeatable.
split_parts = train_test_split(
    train_features,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [15]:
# Random forest for the xyz_vaccine target, seeded for repeatable results.
xyz_target_train = y_train['xyz_vaccine']
rf_xyz = RandomForestClassifier(random_state=42)
rf_xyz.fit(X_train, xyz_target_train)

In [16]:
# Validation predictions for xyz_vaccine: hard class labels plus the
# probability taken from the second column of predict_proba
# (presumably the positive class, assuming 0/1 labels — TODO confirm).
val_proba_matrix_xyz = rf_xyz.predict_proba(X_val)
y_val_pred_proba_xyz = val_proba_matrix_xyz[:, 1]
y_val_pred_xyz = rf_xyz.predict(X_val)

In [17]:
# Score the xyz_vaccine model on the validation split: accuracy uses the hard
# labels, ROC AUC uses the predicted probabilities.
xyz_truth = y_val['xyz_vaccine']
accuracy_xyz = accuracy_score(xyz_truth, y_val_pred_xyz)
roc_auc_xyz = roc_auc_score(xyz_truth, y_val_pred_proba_xyz)

print(
    f"Accuracy for xyz_vaccine: {accuracy_xyz}",
    f"ROC AUC for xyz_vaccine: {roc_auc_xyz}",
    sep="\n",
)

Accuracy for xyz_vaccine: 0.852864095844253
ROC AUC for xyz_vaccine: 0.8614785820538032


In [18]:
# Random forest for the seasonal_vaccine target, seeded for repeatable results.
seasonal_target_train = y_train['seasonal_vaccine']
rf_seasonal = RandomForestClassifier(random_state=42)
rf_seasonal.fit(X_train, seasonal_target_train)


In [19]:
# Validation predictions for seasonal_vaccine: hard class labels plus the
# probability taken from the second column of predict_proba
# (presumably the positive class, assuming 0/1 labels — TODO confirm).
val_proba_matrix_seasonal = rf_seasonal.predict_proba(X_val)
y_val_pred_proba_seasonal = val_proba_matrix_seasonal[:, 1]
y_val_pred_seasonal = rf_seasonal.predict(X_val)

# Accuracy is computed from the hard labels, ROC AUC from the probabilities.
seasonal_truth = y_val['seasonal_vaccine']
accuracy_seasonal = accuracy_score(seasonal_truth, y_val_pred_seasonal)
roc_auc_seasonal = roc_auc_score(seasonal_truth, y_val_pred_proba_seasonal)

print(
    f"Accuracy for seasonal_vaccine: {accuracy_seasonal}",
    f"ROC AUC for seasonal_vaccine: {roc_auc_seasonal}",
    sep="\n",
)

Accuracy for seasonal_vaccine: 0.7839760389367278
ROC AUC for seasonal_vaccine: 0.8567148204426263


In [20]:
# Score the test frame with both fitted models, keeping the second
# predict_proba column from each (the same column used during validation).
test_proba_by_model = [
    model.predict_proba(test_features)[:, 1]
    for model in (rf_xyz, rf_seasonal)
]
xyz_vaccine_proba, seasonal_vaccine_proba = test_proba_by_model


In [23]:
# Assemble the submission table: the saved test respondent ids next to the
# predicted probability for each of the two targets.
submission_columns = {
    'respondent_id': test_respondent_id,
    'xyz_vaccine': xyz_vaccine_proba,
    'seasonal_vaccine': seasonal_vaccine_proba,
}
submission = pd.DataFrame(submission_columns)

In [24]:
# Make sure the id column is written out as integers.
submission = submission.astype({'respondent_id': int})

In [25]:
# Write the submission to the working directory without the DataFrame index.
output_path = 'submission.csv'
submission.to_csv(output_path, index=False)
