In [19]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import roc_auc_score

In [20]:
# Read the three raw CSV files from the working directory.
test_features = pd.read_csv('test_set_features.csv')
train_labels = pd.read_csv('training_set_labels.csv')
train_features = pd.read_csv('training_set_features.csv')

# Join the label columns onto the training features by respondent id
# (default inner join, so only ids present in both files are kept).
train_data = train_features.merge(train_labels, on='respondent_id')

In [21]:
# Columns of the merged training frame that are not predictors:
# the row identifier plus the two target columns.
non_feature_columns = ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']

# One target series per vaccine; each gets its own model later on.
y_train_xyz = train_data['xyz_vaccine']
y_train_s = train_data['seasonal_vaccine']

# Feature frames for training and for the rows to be predicted.
X_train = train_data.drop(columns=non_feature_columns)
X_test = test_features.drop(columns=['respondent_id'])

In [22]:
# Fill missing values with each column's most frequent training value.
# The imputer is fitted on the training features only and then applied
# unchanged to the test features.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit(X_train)

# The imputer output is wrapped back into DataFrames so the original
# column names stay available to the following steps.
X_i = pd.DataFrame(imputer.transform(X_train), columns=X_train.columns)
X_test_i = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

In [23]:
# Integer-encode every text column. The dtype check is done on the
# pre-imputation X_train frame, while the values that get encoded are the
# imputed ones in X_i / X_test_i.
categorical_columns = [
    name for name in X_train.columns if X_train[name].dtype == 'object'
]

# Fitted encoders, keyed by column name, kept so codes can be mapped back.
headings = {}
for name in categorical_columns:
    encoder = LabelEncoder()
    encoder.fit(X_i[name])
    X_i[name] = encoder.transform(X_i[name])
    # NOTE: LabelEncoder.transform raises ValueError if the test column
    # contains a category that was not present in the training column.
    X_test_i[name] = encoder.transform(X_test_i[name])
    headings[name] = encoder

In [24]:
# Standardize the encoded features using statistics learned from the
# training rows only, then apply the same transformation to the test rows.
scaler = StandardScaler().fit(X_i)

# NOTE: this rebinds X_train / X_test from the raw feature DataFrames to the
# scaler's output, so cells that read X_train.columns or X_train dtypes must
# not be re-run after this one without first re-creating the DataFrames.
X_train = scaler.transform(X_i)
X_test = scaler.transform(X_test_i)

In [25]:
# Logistic regression, one independent binary classifier per target.
#
# The models are fitted on X_train / X_test, i.e. the standardized matrices
# produced by the StandardScaler step. Previously the unscaled X_i / X_test_i
# frames were used here, which left the scaling step without any effect on
# the fitted models or on the submitted predictions.
lr_xyz = LogisticRegression(max_iter=1000)
lr_xyz.fit(X_train, y_train_xyz)
lr_s = LogisticRegression(max_iter=1000)
lr_s.fit(X_train, y_train_s)

# NOTE: the AUCs below are computed on the same rows the models were fitted
# on, so they are in-sample scores and will tend to be optimistic; a held-out
# split or cross-validation is needed for an honest estimate.
# predict_proba(...)[:, 1] takes the probability of the second class in
# `classes_` (presumably label 1 for 0/1 targets - verify against the labels).
roc_auc_xyz_lr = roc_auc_score(
    y_train_xyz, lr_xyz.predict_proba(X_train)[:, 1], average='macro'
)
roc_auc_seasonal_lr = roc_auc_score(
    y_train_s, lr_s.predict_proba(X_train)[:, 1], average='macro'
)

# Report the mean of the two per-target AUCs, rounded to two decimals.
print("ROC AUC in Logistic ", round((roc_auc_xyz_lr + roc_auc_seasonal_lr) / 2, 2))

# Predicted probabilities for the test rows, using the same scaled feature
# space the models were trained in.
xyz_vaccine_lr = lr_xyz.predict_proba(X_test)[:, 1]
seasonal_vaccine_lr = lr_s.predict_proba(X_test)[:, 1]

ROC AUC in Logistic  0.84


In [26]:
# Gaussian Naive Bayes baseline: one independent model per target, trained on
# the imputed and label-encoded feature frame X_i.
gnb_xyz = GaussianNB().fit(X_i, y_train_xyz)
gnb_s = GaussianNB().fit(X_i, y_train_s)

# Score each model on its own training rows (in-sample AUC), using the
# probability column at index 1 of predict_proba.
train_proba_xyz_gnb = gnb_xyz.predict_proba(X_i)[:, 1]
roc_auc_xyz_gnb = roc_auc_score(y_train_xyz, train_proba_xyz_gnb)
train_proba_seasonal_gnb = gnb_s.predict_proba(X_i)[:, 1]
roc_auc_seasonal_gnb = roc_auc_score(y_train_s, train_proba_seasonal_gnb)

# Report the mean of the two per-target AUCs, rounded to two decimals.
mean_roc_auc_gnb = (roc_auc_xyz_gnb + roc_auc_seasonal_gnb) / 2
print(f"ROC AUC in Guassian Naive Bias is ", round(mean_roc_auc_gnb, 2))

# Predicted probabilities for the test rows.
xyz_vaccine_gnb = gnb_xyz.predict_proba(X_test_i)[:, 1]
seasonal_vaccine_gnb = gnb_s.predict_proba(X_test_i)[:, 1]

ROC AUC in Guassian Naive Bias is  0.79


In [27]:
# Collect the test-set predictions of both model families next to the
# respondent ids. Only the logistic-regression columns end up in the file;
# the Gaussian NB columns are assembled here but not written out.
test_ids = test_features['respondent_id']
all_predictions = pd.DataFrame({
    'respondent_id': test_ids,
    'xyz_vaccine_lr': xyz_vaccine_lr,
    'seasonal_vaccine_lr': seasonal_vaccine_lr,
    'xyz_vaccine_gnb': xyz_vaccine_gnb,
    'seasonal_vaccine_gnb': seasonal_vaccine_gnb,
})

# Final columns: the logistic-regression probabilities rounded to two
# decimals, under the target column names.
submission_columns = ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']
submission = all_predictions.assign(
    xyz_vaccine=all_predictions['xyz_vaccine_lr'].round(2),
    seasonal_vaccine=all_predictions['seasonal_vaccine_lr'].round(2),
)[submission_columns]

# Write the file without the DataFrame index, then confirm on stdout.
submission.to_csv('final_submission.csv', index=False)

print("Submission file 'final_submission.csv' created successfully.")


Submission file 'final_submission.csv' created successfully.
