In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score


In [2]:
# Load your data from CSV files
X_train = pd.read_csv('C:\\Users\\vedan\\Downloads\\dataset and all\\training_set_features.csv')
Y_train = pd.read_csv('C:\\Users\\vedan\\Downloads\\dataset and all\\training_set_labels.csv')


In [3]:
# Drop specified columns
columns_to_drop = ['age_group', 'education', 'race', 'sex', 'income_poverty',
                   'marital_status', 'rent_or_own', 'employment_status', 'hhs_geo_region',
                   'census_msa', 'employment_industry', 'employment_occupation']
X_train = X_train.drop(columns=columns_to_drop, axis=1)


In [4]:
# Combine X_train and Y_train to drop NaN values simultaneously
combined = pd.concat([X_train, Y_train], axis=1)

# Drop rows with any NaN values
combined = combined.dropna()

# Split the combined DataFrame back into X_train and Y_train
X_train = combined.iloc[:, :-2]
Y_train = combined.iloc[:, -2:]


In [5]:
# Replace infinite values with NaN and then handle them
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_train = X_train.fillna(X_train.mean())

# Scale the data to handle extremely large values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


In [6]:
# Ensure both DataFrames have the same number of rows
print(X_train.shape)
print(Y_train.shape)



(13506, 25)
(13506, 2)


In [7]:
# Initialize logistic regression model
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)

# Use MultiOutputClassifier for multilabel classification
clf = MultiOutputClassifier(logreg)

# Train the model
clf.fit(X_train, Y_train)

# Predict on the training set itself (for demonstration purposes)
y_pred = clf.predict(X_train)

# Print classification report
print(classification_report(Y_train, y_pred))

# Predict probabilities for ROC AUC
y_prob = clf.predict_proba(X_train)

# Calculate ROC AUC score for each target
roc_auc_xyz = roc_auc_score(Y_train.iloc[:, 0], y_prob[0][:, 1])
roc_auc_seasonal = roc_auc_score(Y_train.iloc[:, 1], y_prob[1][:, 1])
mean_roc_auc = (roc_auc_xyz + roc_auc_seasonal) / 2

print(f'ROC AUC for xyz_vaccine: {roc_auc_xyz}')
print(f'ROC AUC for seasonal_vaccine: {roc_auc_seasonal}')
print(f'Mean ROC AUC: {mean_roc_auc}')


              precision    recall  f1-score   support

           0       0.75      0.58      0.65      4031
           1       0.79      0.77      0.78      6802

   micro avg       0.78      0.70      0.74     10833
   macro avg       0.77      0.68      0.72     10833
weighted avg       0.77      0.70      0.73     10833
 samples avg       0.40      0.38      0.38     10833

ROC AUC for xyz_vaccine: 0.8535283740981012
ROC AUC for seasonal_vaccine: 0.8570731556912575
Mean ROC AUC: 0.8553007648946793


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
