In [30]:
pip install gcsfs

Collecting gcsfs
  Downloading gcsfs-2024.6.0-py2.py3-none-any.whl.metadata (1.6 kB)
Collecting fsspec==2024.6.0 (from gcsfs)
  Downloading fsspec-2024.6.0-py3-none-any.whl.metadata (11 kB)
Collecting google-cloud-storage (from gcsfs)
  Using cached google_cloud_storage-2.16.0-py2.py3-none-any.whl.metadata (6.1 kB)
Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->gcsfs)
  Using cached google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)
Downloading gcsfs-2024.6.0-py2.py3-none-any.whl (34 kB)
Downloading fsspec-2024.6.0-py3-none-any.whl (176 kB)
   ---------------------------------------- 0.0/176.9 kB ? eta -:--:--
   ---------------------------------------  174.1/176.9 kB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 176.9/176.9 kB 3.5 MB/s eta 0:00:00
Using cached google_cloud_storage-2.16.0-py2.py3-none-any.whl (125 kB)
Using cached google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)
Installing collected packages: fsspec, google-cl



In [46]:
import gcsfs
import joblib
from datetime import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import warnings

warnings.filterwarnings("ignore")

In [40]:
import gcsfs
import joblib

def load_data_from_gcs(file_paths):
    fs = gcsfs.GCSFileSystem()
    
    data = {}
    for key, file_path in file_paths.items():
        with fs.open(file_path, 'rb') as f:
            data[key] = joblib.load(f)
    
    return data

In [41]:
file_paths = {
    'X_train': 'gs://sepsis-prediction-mlops/data/processed_data/X_train.pkl',
    'X_test': 'gs://sepsis-prediction-mlops/data/processed_data/X_test.pkl',
    'y_train': 'gs://sepsis-prediction-mlops/data/processed_data/y_train.pkl',
    'y_test': 'gs://sepsis-prediction-mlops/data/processed_data/y_test.pkl'
}

data = load_data_from_gcs(file_paths)
X_train = data['X_train']
X_test = data['X_test']
y_train = data['y_train']
y_test = data['y_test']

In [47]:
def get_models_and_parameters():
    models_and_parameters = {
        'RandomForest': {
            'model': RandomForestClassifier(),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        },
        'XGBoost': {
            'model': XGBClassifier(),
            'params': {
                'n_estimators': [50, 100, 200],
                'max_depth': [3, 6, 9],
                'learning_rate': [0.01, 0.1, 0.2]
            }
        },
        'DecisionTree': {
            'model': DecisionTreeClassifier(),
            'params': {
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10]
            }
        },
        'LogisticRegression': {
            'model': LogisticRegression(max_iter=200),
            'params': {
                'C': [0.1, 1, 10],
                'solver': ['liblinear', 'lbfgs']
            }
        }
    }
    return models_and_parameters


def train_and_evaluate_models(X_train, y_train, X_test, y_test):
    models_and_parameters = get_models_and_parameters()
    best_models = {}
    
    for model_name, model_info in models_and_parameters.items():
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(estimator=model_info['model'], param_grid=model_info['params'], cv=5, scoring='f1')
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        
        y_pred = best_model.predict(X_test)
        f1 = f1_score(y_test, y_pred)
        
        best_models[model_name] = {
            'model': best_model,
            'params': best_params,
            'f1_score': f1
        }
        
        print(f"{model_name} - Best Params: {best_params}, F1 Score: {f1}")
    
    return best_models


def get_best_model(best_models):
    best_model_name = max(best_models, key=lambda x: best_models[x]['f1_score'])
    best_model_info = best_models[best_model_name]
    
    print(f"Best Model: {best_model_name}")
    print(f"Best Params: {best_model_info['params']}")
    print(f"F1 Score: {best_model_info['f1_score']}")
    
    return best_model_info


def display_classification_metrics(model, X_test, y_test):
    # Predict the labels for the test set
    y_pred = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)
    
    # Display the metrics
    print("Accuracy: {:.4f}".format(accuracy))
    print("Precision: {:.4f}".format(precision))
    print("Recall: {:.4f}".format(recall))
    print("F1 Score: {:.4f}".format(f1))
    print("\nConfusion Matrix:\n", conf_matrix)
    print("\nClassification Report:\n", class_report)
    
    # Return the metrics in a dictionary
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'confusion_matrix': conf_matrix,
        'classification_report': class_report
    }
    
    return metrics


def generate_model_name(model_name_prefix, bucket_name, models_directory):
    fs = gcsfs.GCSFileSystem()
    model_files = fs.glob(f'{bucket_name}/{models_directory}/*')
    
    if not model_files:
        version = 1
    else:
        versions = [int(file.split('_v')[-1].split('.')[0]) for file in model_files if file.startswith(f'{bucket_name}/{models_directory}/{model_name_prefix}')]
        version = max(versions) + 1 if versions else 1
    
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_name = f"{model_name_prefix}_{timestamp}_v{version}.pkl"
    
    return f"{bucket_name}/{models_directory}/{model_name}"

def save_model(model, file_path):
    with open(file_path, 'wb') as f:
        joblib.dump(model, f)
    print(f"Model saved to {file_path}")

In [50]:
X_train.shape

(585311, 17)

In [51]:
y_train.shape

(698474,)

In [48]:
# Train and evaluate models
best_models = train_and_evaluate_models(X_train, y_train, X_test, y_test)

# Get the best model
best_model_info = get_best_model(best_models)
best_model = best_model_info['model']
model_name_prefix = list(best_models.keys())[list(best_models.values()).index(best_model_info)]
best_model_metrics = display_classification_metrics(best_model, X_test, y_test)


Training RandomForest...


ValueError: Found input variables with inconsistent numbers of samples: [585311, 698474]

In [None]:
bucket_name = 'gs://sepsis-prediction-mlops'
models_directory = 'artifacts/models'
model_name = generate_model_name(model_name_prefix, bucket_name, models_directory)
save_model(best_model, model_name)