In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

# Read the four competition CSVs: training features, training labels,
# test features, and the submission template.
train_features = pd.read_csv(r"C:\Users\Nirmiti\training_set_features.csv")
train_labels = pd.read_csv(r"C:\Users\Nirmiti\training_set_labels.csv")
test_features = pd.read_csv(r"C:\Users\Nirmiti\test_set_features.csv")
submission_format = pd.read_csv(r"C:\Users\Nirmiti\submission_format.csv")

# Columns whose share of missing training values is above this percentage
# are removed from both feature tables.
threshold = 40

# Per-column percentage of missing values, measured on the training set only.
missing_data_percentage = train_features.isna().mean().mul(100)

# Keep the filtered series around so it can be both reported and used
# as the list of columns to remove.
sparse_columns = missing_data_percentage[missing_data_percentage > threshold]
columns_to_drop = sparse_columns.index

# Report which columns (if any) crossed the threshold.
print(f"Columns with more than {threshold}% missing data:")
print(sparse_columns)

# Remove the same columns from train and test so their schemas stay in step.
train_features = train_features.drop(columns_to_drop, axis="columns")
test_features = test_features.drop(columns_to_drop, axis="columns")

# Prepare the data: separate the feature matrix from the two target columns.
# NOTE(review): features and labels are paired purely by row position; this
# assumes both CSVs list respondents in the same order — confirm.
X = train_features.drop(columns=['respondent_id'])
y_xyz = train_labels['xyz_vaccine']
y_seasonal = train_labels['seasonal_vaccine']

# Recalculate numeric and non-numeric columns after dropping
numeric_cols = X.select_dtypes(include=['number']).columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns

# Learn the imputation values from the TRAINING data only and apply the same
# values to the test set. Imputing the test set with its own medians/modes
# would preprocess train and test differently, so the model would see test
# inputs on a slightly different footing from the ones it was fitted on.
train_medians = X[numeric_cols].median()
X[numeric_cols] = X[numeric_cols].fillna(train_medians)
test_features[numeric_cols] = test_features[numeric_cols].fillna(train_medians)

# Fill missing values in non-numeric columns with the training mode.
# Skipped when there are no such columns, because `.mode().iloc[0]` has no
# row to select in that case.
if not non_numeric_cols.empty:
    train_modes = X[non_numeric_cols].mode().iloc[0]
    X[non_numeric_cols] = X[non_numeric_cols].fillna(train_modes)
    test_features[non_numeric_cols] = test_features[non_numeric_cols].fillna(train_modes)

# Encode categorical variables
X = pd.get_dummies(X)
test_features = pd.get_dummies(test_features)

# Align columns in test_features to match training features: columns missing
# from the test set are added as zeros, and columns absent from X (including
# 'respondent_id' and any category not seen in training) are discarded.
test_features = test_features.reindex(columns=X.columns, fill_value=0)

# Split the data for validation purposes.
# Both targets share the same feature matrix, so split ONCE and pass both
# label series through the same call. This guarantees the xyz and seasonal
# models see exactly the same train/validation rows, instead of relying on
# two separate calls happening to agree because of a shared random_state.
(
    X_train,
    X_val,
    y_train_xyz,
    y_val_xyz,
    y_train_seasonal,
    y_val_seasonal,
) = train_test_split(X, y_xyz, y_seasonal, test_size=0.2, random_state=42)

# Standardize the features.
# The scaler is fitted exactly once, on the training rows only, and that one
# fit is used for the validation and test matrices. Previously the same
# scaler object was re-fitted for the second target, so the test set was
# transformed with whichever fit happened to run last.
scaler = StandardScaler()
X_train_xyz = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_val_xyz = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)

# The seasonal model uses the same scaled rows; copies keep the two sets of
# frames independent objects, as they were before.
X_train_seasonal = X_train_xyz.copy()
X_val_seasonal = X_val_xyz.copy()

test_features_scaled = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns)

# Train logistic regression models, one per target.
# max_iter is raised to 1000 to give the solver more iterations to converge.
model_xyz = LogisticRegression(max_iter=1000)
model_seasonal = LogisticRegression(max_iter=1000)

model_xyz.fit(X_train_xyz, y_train_xyz)
model_seasonal.fit(X_train_seasonal, y_train_seasonal)

# Predict on the training set (probability of the positive class)
y_train_pred_xyz = model_xyz.predict_proba(X_train_xyz)[:, 1]
y_train_pred_seasonal = model_seasonal.predict_proba(X_train_seasonal)[:, 1]

# Predict on the validation set (probability of the positive class)
y_val_pred_xyz = model_xyz.predict_proba(X_val_xyz)[:, 1]
y_val_pred_seasonal = model_seasonal.predict_proba(X_val_seasonal)[:, 1]

# Validation accuracy. The validation features must be scored against the
# validation labels; pairing them with the training labels mixes arrays of
# different lengths. The result is stored so it is not silently discarded.
accuracy_val_xyz = model_xyz.score(X_val_xyz, y_val_xyz)
accuracy_val_seasonal = model_seasonal.score(X_val_seasonal, y_val_seasonal)

# Calculate ROC AUC scores for the training set
roc_auc_train_xyz = roc_auc_score(y_train_xyz, y_train_pred_xyz)
roc_auc_train_seasonal = roc_auc_score(y_train_seasonal, y_train_pred_seasonal)

# Calculate ROC AUC scores for the validation set
roc_auc_val_xyz = roc_auc_score(y_val_xyz, y_val_pred_xyz)
roc_auc_val_seasonal = roc_auc_score(y_val_seasonal, y_val_pred_seasonal)

# Score the scaled test rows with each fitted model, keeping only the
# positive-class probability column.
test_pred_xyz, test_pred_seasonal = (
    fitted_model.predict_proba(test_features_scaled)[:, 1]
    for fitted_model in (model_xyz, model_seasonal)
)

# Fill the submission template with the predicted probabilities.
# NOTE(review): values are assigned by row position, so this assumes the
# template lists respondents in the same order as the test features — confirm.
for target_column, probabilities in (
    ("xyz_vaccine", test_pred_xyz),
    ("seasonal_vaccine", test_pred_seasonal),
):
    submission_format[target_column] = probabilities

# NOTE: this is the same path the submission template was loaded from, so
# the template file on disk is overwritten with the predictions.
submission_file_path = r"C:\Users\Nirmiti\submission_format.csv"
submission_format.to_csv(submission_file_path, index=False)

# Final cell expression: the four ROC AUC scores and the output path.
roc_auc_train_xyz, roc_auc_train_seasonal, roc_auc_val_xyz, roc_auc_val_seasonal, submission_file_path


Columns with more than 40% missing data:
Series([], dtype: float64)


(0.8326850538311015,
 0.8497942283613716,
 0.8266982662262898,
 0.8514784624718506,
 'C:\\Users\\Nirmiti\\submission_format.csv')