In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

# Read the three competition CSVs: training features, training labels,
# and the test features that predictions will be produced for.
train_df = pd.read_csv('/kaggle/input/c-and-a-submission/training_set_features.csv')
labels_df = pd.read_csv('/kaggle/input/c-and-a-submission/training_set_labels.csv')
test_df = pd.read_csv('/kaggle/input/c-and-a-submission/test_set_features.csv')

# Join each feature row with its labels through the shared respondent_id key
# (default inner join, so only respondents present in both files are kept).
train_df = pd.merge(train_df, labels_df, on='respondent_id')

# Split the merged frame into the two target columns and the predictors.
# The join key is dropped from the predictors together with the targets.
target_cols = ['xyz_vaccine', 'seasonal_vaccine']
y = train_df[target_cols]
X = train_df.drop(['respondent_id', *target_cols], axis=1)

# Impute missing values with each column's most frequent TRAINING value.
# The very same fill values are applied to the test set so that both frames
# go through an identical transformation; test-set statistics are never used.
fill_values = {}
for col in X.columns:
    counts = X[col].value_counts()
    if not counts.empty:  # an all-NaN column has no mode; leave it untouched
        fill_values[col] = counts.index[0]
X = X.fillna(fill_values)
test_df = test_df.fillna(fill_values)

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns

# Encode categorical columns using LabelEncoder.
# Each encoder is fitted on the training column only and then reused to
# transform the test column, so a given category always maps to the same
# integer in both frames. `label_encoders` keeps the training-fitted encoders.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

    # LabelEncoder.transform raises on labels it has not seen during fit, so
    # categories that only occur in the test set are replaced by the training
    # mode (or the first known class if the column had no mode) beforehand.
    test_values = test_df[col].astype(str)
    fallback = str(fill_values[col]) if col in fill_values else le.classes_[0]
    test_values = test_values.where(test_values.isin(le.classes_), fallback)
    test_df[col] = le.transform(test_values)

    label_encoders[col] = le

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Base estimator shared by both searches (GridSearchCV fits clones of it).
# The liblinear solver supports both penalties explored in the grid below.
model = LogisticRegression(max_iter=1000, solver='liblinear')

# Candidate hyperparameters: penalty type x inverse regularisation strength.
param_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.01, 0.1, 1, 10, 100],
)

# One independent 5-fold, ROC-AUC-scored grid search per target column.
search_settings = dict(cv=5, scoring='roc_auc')
grid_search_xyz = GridSearchCV(model, param_grid, **search_settings)
grid_search_seasonal = GridSearchCV(model, param_grid, **search_settings)

# Fit each search on its own target; fit() returns the fitted search object.
grid_search_xyz = grid_search_xyz.fit(X_train, y_train['xyz_vaccine'])
grid_search_seasonal = grid_search_seasonal.fit(X_train, y_train['seasonal_vaccine'])

# Report the winning hyperparameter combination for each target.
best_params_xyz = grid_search_xyz.best_params_
best_params_seasonal = grid_search_seasonal.best_params_

print(f'Best parameters for xyz_vaccine: {best_params_xyz}')
print(f'Best parameters for seasonal_vaccine: {best_params_seasonal}')

# Score the held-out validation rows, one target at a time.
# predict_proba returns one column per class; index 1 is taken as the
# positive-class probability (assumes binary 0/1 labels -- TODO confirm).
xyz_val_pred_proba = grid_search_xyz.predict_proba(X_val)[:, 1]
xyz_auc = roc_auc_score(
    y_true=y_val['xyz_vaccine'],
    y_score=xyz_val_pred_proba,
)

seasonal_val_pred_proba = grid_search_seasonal.predict_proba(X_val)[:, 1]
seasonal_auc = roc_auc_score(
    y_true=y_val['seasonal_vaccine'],
    y_score=seasonal_val_pred_proba,
)

# Report both validation ROC AUC values to four decimal places.
print(f'ROC AUC for xyz_vaccine: {xyz_auc:.4f}')
print(f'ROC AUC for seasonal_vaccine: {seasonal_auc:.4f}')

# Build the test feature matrix by removing the identifier column, then take
# the second predict_proba column from each fitted search as the prediction.
X_test = test_df.drop('respondent_id', axis=1)
xyz_test_pred_proba = grid_search_xyz.predict_proba(X_test)[:, 1]
seasonal_test_pred_proba = grid_search_seasonal.predict_proba(X_test)[:, 1]

# Assemble the submission: respondent ids followed by one probability column
# per target, in that column order.
submission_df = test_df[['respondent_id']].assign(
    xyz_vaccine=xyz_test_pred_proba,
    seasonal_vaccine=seasonal_test_pred_proba,
)

# Write the submission without the DataFrame index column.
submission_df.to_csv('submission.csv', index=False)



Best parameters for xyz_vaccine: {'C': 10, 'penalty': 'l1'}
Best parameters for seasonal_vaccine: {'C': 1, 'penalty': 'l1'}
ROC AUC for xyz_vaccine: 0.8277
ROC AUC for seasonal_vaccine: 0.8502
