In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, Binarizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing  import LabelEncoder

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

# Model Selection
from sklearn.model_selection import train_test_split, GridSearchCV

# Metrics
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score,confusion_matrix

# Miscellaneous
import pickle


In [2]:
# Load data
data = pd.read_csv('Dataset\survey.csv')

# Define preprocessing for numeric columns (normalize them so they're on the same scale)
numeric_features = ['Age']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    # Add binning here
    ('binarizer', Binarizer(threshold=35))
])

# Define preprocessing for categorical features (encode them)
categorical_features = ['Gender', 'Country', 'state', 'self_employed', 'family_history',
                       'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits',
                       'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave',
                       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
                       'supervisor', 'mental_health_interview', 'phys_health_interview',
                       'mental_vs_physical', 'obs_consequence']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Add PCA to the pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # Add PCA here
    # ('pca', TruncatedSVD(n_components=20))
])

# Fit the pipeline to the training data
data_transformed = pipeline.fit_transform(data.drop(columns=['treatment'],axis=1))
le = LabelEncoder()
label_transformed = le.fit_transform(data['treatment'])


In [3]:
with open('preprocesor.pkl','wb') as file:
    pickle.dump(pipeline,file)

In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data_transformed, label_transformed, test_size=0.25, random_state=42)

# Define the hyperparameter search space for each model
hyperparameter_spaces = {
    "LogisticRegression": {
        "C": np.logspace(-5, 5, 10),
        "penalty": ["l2", "none"],  # Change this line
    },
    "DecisionTreeClassifier": {
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
    },
    "KNeighborsClassifier": {
        "n_neighbors": [3, 5, 7, 9],
        "weights": ["uniform", "distance"],
    },
    "RandomForestClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "max_depth": [3, 5, 10, None],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 5],
    },
    "AdaBoostClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "learning_rate": [0.1, 0.5, 1.0],
    },
    "GradientBoostingClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "learning_rate": [0.1, 0.5, 1.0],
        "max_depth": [3, 5, 10, None],
    },
    "XGBClassifier": {
        "n_estimators": [100, 200, 500, 1000],
        "learning_rate": [0.1, 0.5, 1.0],
        "max_depth": [3, 5, 10, None],
    },
}

# Perform hyperparameter tuning for each model
best_model, best_score, best_report = None, 0, None
for model_name, hyperparameter_space in hyperparameter_spaces.items():
    print(f"Evaluating {model_name}...")  # Add this line
    model = eval(model_name)()
    grid_search = GridSearchCV(model, hyperparameter_space, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Print the best score of the current model
    print(f"Best score for {model_name}: {grid_search.best_score_}")  # Add this line
    
    # Evaluate the model on the test set
    y_pred = grid_search.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred)

    # If this model is better than the previous best_model, update best_model and best_report
    if accuracy > best_score:
        best_model = grid_search.best_estimator_
        best_score = accuracy
        best_report = {
            "accuracy": accuracy,
            "classification_report": classification_rep,
            "auc": auc_score,
            "confusion_matrix": confusion_matrix(y_test, y_pred),
        }

# Print the test results for the best model
print("Best model:", type(best_model).__name__)
print("Best score:", best_score)
print("Best report:", best_report)


Evaluating LogisticRegression...
Best score for LogisticRegression: 0.8410953506698187
Evaluating DecisionTreeClassifier...
Best score for DecisionTreeClassifier: 0.8357930879207475
Evaluating KNeighborsClassifier...
Best score for KNeighborsClassifier: 0.7383372734436564
Evaluating RandomForestClassifier...
Best score for RandomForestClassifier: 0.8389733198243837
Evaluating AdaBoostClassifier...
Best score for AdaBoostClassifier: 0.8326184847461443
Evaluating GradientBoostingClassifier...
Best score for GradientBoostingClassifier: 0.8347348868625464
Evaluating XGBClassifier...
Best score for XGBClassifier: 0.8421479229989869
Best model: DecisionTreeClassifier
Best score: 0.819047619047619
Best report: {'accuracy': 0.819047619047619, 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.93      0.69      0.79       158\n           1       0.75      0.95      0.84       157\n\n    accuracy                           0.82       315\n   ma

In [5]:
# Save the best model to a pickle file
with open('model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

In [8]:
# Load the preprocessor and model from the pickle files
with open('preprocesor.pkl', 'rb') as file:
    preprocessor = pickle.load(file)

with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

def test_predict():
    # Create a new data point
    new_data = pd.DataFrame({
        "Timestamp": ["2023-10-10 12:00:00"],
        "Age": [35],
        "Gender": ["Male"],
        "Country": ["United States"],
        "state": ["CA"],
        "self_employed": [False],
        "family_history": [True],
        "treatment": ["Yes"],
        "work_interfere": ["Sometimes"],
        "no_employees": [26-100],
        "remote_work": [True],
        "tech_company": [True],
        "benefits": [True],
        "care_options": [True],
        "wellness_program": [True],
        "seek_help": [True],
        "anonymity": [True],
        "leave": [True],
        "mental_health_consequence": [True],
        "phys_health_consequence": [True],
        "coworkers": ["Yes"],
        "supervisor": ["Yes"],
        "mental_health_interview": ["Yes"],
        "phys_health_interview": ["Yes"],
        "mental_vs_physical": ["No"],
        "obs_consequence": ["No"]
    })

    # Preprocess the new data
    new_data_transformed = preprocessor.transform(new_data.drop(columns=['treatment'],axis=1))

    # Make a prediction
    prediction = model.predict(new_data_transformed)[0]
    print(prediction)

    # Assert that the prediction is 1 (for treatment)
    assert prediction == 1, f"Expected 1, but got {prediction}"


test_predict()

1
