In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# logging
import logging
from datetime import datetime
from time import time


# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Binarizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing  import LabelEncoder

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# Model Selection
from sklearn.model_selection import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score,confusion_matrix

# Miscellaneous
import pickle


In [2]:
# Load data
data = pd.read_csv('..\Dataset\processed.csv')

# Define preprocessing for numeric columns (normalize them so they're on the same scale)
numeric_features = ['Age']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    # Add binning here
    ('binarizer', Binarizer(threshold=35))
])

# Define preprocessing for categorical features (encode them)
categorical_features = ['Gender', 'Country', 'state', 'self_employed', 'family_history',
                       'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits',
                       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
                       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
                       'supervisor', 'mental_health_interview', 'phys_health_interview',
                       'mental_vs_physical', 'obs_consequence']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Add PCA to the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Add PCA here
    # ('pca', TruncatedSVD(n_components=20))
])

# Fit the pipeline to the training data
data_transformed = pipeline.fit_transform(data.drop(columns=['treatment'],axis=1))
le = LabelEncoder()
label_transformed = le.fit_transform(data['treatment'])


In [3]:
with open('../models/preprocessed.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [4]:
# Setup logging
logging.basicConfig(
    format='[%(asctime)s] - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler("model_training.log"),
        logging.StreamHandler()
    ]
)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_transformed, label_transformed, test_size=0.25, random_state=42)

# Define the hyperparameter search space for each model
hyperparameter_spaces = {
    "LogisticRegression": {
        "C": np.logspace(-5, 5, 10),
        "penalty": ["l2"],  
    },
    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
    },
    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
    },
    "RandomForestClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
    },
    "AdaBoostClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "learning_rate": [0.1, 0.5, 1.0],
    },
    "GradientBoostingClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "learning_rate": [0.1, 0.5, 1.0],
        "max_depth": [3, 5, 10, None],
    },
    "XGBClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "learning_rate": [0.1, 0.5, 1.0],
        "max_depth": [3, 5, 10, None],
    },
}

# Perform hyperparameter tuning for each model
best_model, best_score, best_report = None, 0, None
for model_name, hyperparameter_space in hyperparameter_spaces.items():
    logging.info(f"Evaluating {model_name}...")
    model = eval(model_name)()
    grid_search = GridSearchCV(model, hyperparameter_space, cv=5, scoring="accuracy", n_jobs=-1, error_score='raise')
    
    start_time = time()
    try:
        grid_search.fit(X_train, y_train)
        elapsed_time = time() - start_time
        
        best_score_for_model = grid_search.best_score_
        logging.info(f"Best score for {model_name}: {best_score_for_model}")
        logging.info(f"Time elapsed for {model_name}: {elapsed_time:.2f} seconds")

        # Evaluate the model on the test set
        y_pred = grid_search.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        classification_rep = classification_report(y_test, y_pred)
        auc_score = roc_auc_score(y_test, y_pred)
        
        logging.info(f"Accuracy for {model_name}: {accuracy}")
        logging.info(f"Classification Report for {model_name}:\n{classification_rep}")
        logging.info(f"AUC Score for {model_name}: {auc_score}")

        # If this model is better than the previous best_model, update best_model and best_report
        if accuracy > best_score:
            best_model = grid_search.best_estimator_
            best_score = accuracy
            best_report = {
                "accuracy": accuracy,
                "classification_report": classification_rep,
                "auc": auc_score,
                "confusion_matrix": confusion_matrix(y_test, y_pred),
            }

    except Exception as e:
        elapsed_time = time() - start_time
        logging.error(f"Error with {model_name} after {elapsed_time:.2f} seconds: {e}")

# Print the test results for the best model
logging.info("Best model: %s", type(best_model).__name__)
logging.info("Best score: %f", best_score)
logging.info("Best report: %s", best_report)

[2024-07-26 18:07:40,103] - INFO - Evaluating LogisticRegression...
[2024-07-26 18:07:42,270] - INFO - Best score for LogisticRegression: 0.8400315208825846
[2024-07-26 18:07:42,271] - INFO - Time elapsed for LogisticRegression: 2.17 seconds
[2024-07-26 18:07:42,278] - INFO - Accuracy for LogisticRegression: 0.8126984126984127
[2024-07-26 18:07:42,279] - INFO - Classification Report for LogisticRegression:
              precision    recall  f1-score   support

           0       0.86      0.75      0.80       158
           1       0.78      0.87      0.82       157

    accuracy                           0.81       315
   macro avg       0.82      0.81      0.81       315
weighted avg       0.82      0.81      0.81       315

[2024-07-26 18:07:42,279] - INFO - AUC Score for LogisticRegression: 0.8128880109650891
[2024-07-26 18:07:42,281] - INFO - Evaluating DecisionTreeClassifier...
[2024-07-26 18:07:42,546] - INFO - Best score for DecisionTreeClassifier: 0.8357930879207475
[2024-07-2

In [5]:
# Save the best model to a pickle file
with open('../models/model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [6]:
# Load the preprocessor and model from the pickle files
with open('../models/preprocessed.pkl', 'rb') as file:
    preprocessor = pickle.load(file)

with open('../models/model.pkl', 'rb') as file:
    model = pickle.load(file)

def test_predict():
    # Create a new data point
    new_data = pd.DataFrame({
        "Timestamp": ["2023-10-10 12:00:00"],
        "Age": [35],
        "Gender": ["Male"],
        "Country": ["United States"],
        "state": ["CA"],
        "self_employed": [False],
        "family_history": [True],
        "treatment": ["Yes"],
        "work_interfere": ["Sometimes"],
        "no_employees": [26-100],
        "remote_work": [True],
        "tech_company": [True],
        "benefits": [True],
        "care_options": [True],
        "wellness_program": [True],
        "seek_help": [True],
        "anonymity": [True],
        "leave": [True],
        "mental_health_consequence": [True],
        "phys_health_consequence": [True],
        "coworkers": ["Yes"],
        "supervisor": ["Yes"],
        "mental_health_interview": ["Yes"],
        "phys_health_interview": ["Yes"],
        "mental_vs_physical": ["No"],
        "obs_consequence": ["No"]
    })

    # Preprocess the new data
    new_data_transformed = preprocessor.transform(new_data.drop(columns=['treatment'],axis=1))

    # Make a prediction
    prediction = model.predict(new_data_transformed)[0]
    print(prediction)

    # Assert that the prediction is 1 (for treatment)
    assert prediction == 1, f"Expected 1, but got {prediction}"


test_predict()

1
