# In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Preprocessing pipeline for the hospital readmissions data:
# load -> fill missing values -> encode categoricals -> scale numerics -> save.

# Load the dataset
df = pd.read_csv('hospital_readmissions.csv')

# Handle missing values
# For categorical variables, replace missing with 'Unknown'.
# The result is assigned back rather than calling fillna(inplace=True) on
# df[col]: that chained form operates on an intermediate object, raises a
# FutureWarning in pandas 2.x, and leaves df unchanged under Copy-on-Write.
categorical_cols = ['glucose_test', 'A1Ctest', 'change', 'diabetes_med']
for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')

# For numerical columns, replace missing values with the median
numerical_cols = ['time_in_hospital', 'n_procedures', 'n_lab_procedures', 'n_medications',
                  'n_outpatient', 'n_inpatient', 'n_emergency']
imputer = SimpleImputer(strategy='median')
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

# Convert categorical columns to numerical values using Label Encoding or One-Hot Encoding
# Label Encoding for binary columns (e.g., 'yes'/'no' columns).
# Each column gets its own fitted encoder, kept in `label_encoders`, so the
# category -> integer mapping of every column stays available (e.g. for
# inverse_transform). `le` ends up bound to the encoder fitted on 'change',
# which is the state the single shared encoder used to be left in.
# NOTE(review): rows that were missing now carry 'Unknown', so these columns
# may have three classes rather than two -- confirm this is acceptable.
label_encoders = {}
for binary_col in ['diabetes_med', 'change']:
    le = LabelEncoder()
    df[binary_col] = le.fit_transform(df[binary_col])
    label_encoders[binary_col] = le

# One-Hot Encoding for multi-category variables (e.g., 'age', 'medical_specialty', 'diag_1').
# drop_first=True drops one indicator per variable to avoid redundant columns.
# NOTE(review): 'glucose_test' and 'A1Ctest' are filled above but are not
# encoded anywhere in this script; they are written out as-is -- confirm
# that downstream consumers expect that.
df = pd.get_dummies(df, columns=['age', 'medical_specialty', 'diag_1'], drop_first=True)

# Feature Scaling (Standardization)
# NOTE(review): the imputer and scaler are fitted on the full dataset; if a
# train/test split happens later, fit them on the training portion only.
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Check the first few rows after preprocessing
print(df.head())

# Save the preprocessed data to a new CSV file or directly upload to BigQuery
df.to_csv('hospital_readmissions_preprocessed.csv', index=False)
