In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import VotingClassifier
import pickle

path = './survey.csv'
data_file = pd.read_csv(path)

dropped_columns = ['comments', 'Timestamp', 'state', 'self_employed', 'work_interfere', 'Country']
data_file.drop(dropped_columns, axis=1, inplace=True)
data_file.dropna(inplace=True)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


KeyError: "['Timestamp', 'state', 'self_employed', 'work_interfere', 'Country'] not found in axis"

In [None]:
path = './survey.csv'
data_file = pd.read_csv(path)
data_file.head()

In [None]:
data_file = data_file.drop(['comments', 'Timestamp', 'state', 'self_employed', 'work_interfere', 'Country'], axis=1)

In [None]:
dropped_columns = ['comments', 'Timestamp', 'state', 'self_employed', 'work_interfere', 'Country']
# data_file.drop(dropped_columns, axis=1, inplace=True)
data_file.dropna(inplace=True)

In [None]:
# Process gender column
def process_gender(gender):
    gender = gender.lower()
    if 'female' not in gender and ('male' in gender or gender == 'm'):
        return 'male'
    elif 'female' in gender or gender == 'f':
        return 'female'
    else:
        return 'others'

data_file['Gender'] = data_file['Gender'].apply(process_gender)

# Filter age column
data_file = data_file[(data_file['Age'] > 16) & (data_file['Age'] < 100)]


In [None]:
# Define features and target
y = data_file['treatment'].replace(['Yes', 'No'], [1, 0])
X = data_file.drop(columns=['treatment'])


In [None]:
# Define numeric and categorical features
numeric_features = ['Age']
categorical_features = ['Gender', 'family_history', 'no_employees',
                        'remote_work', 'tech_company', 'benefits', 'care_options',
                        'wellness_program', 'seek_help', 'anonymity', 'leave',
                        'mental_health_consequence', 'phys_health_consequence', 'coworkers',
                        'supervisor', 'mental_health_interview', 'phys_health_interview',
                        'mental_vs_physical', 'obs_consequence']

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)  # drop='first' to handle multicollinearity
    ])

In [None]:

X_preprocessed = preprocessor.fit_transform(X)

# Get the feature names after one-hot encoding
feature_names = numeric_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features))

# Convert the transformed data back to a DataFrame
X_preprocessed_df = pd.DataFrame(X_preprocessed, columns=feature_names)

# Display the first few rows of the preprocessed DataFrame
print(X_preprocessed_df.head())

#save the preprocessor
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Define models and parameter grids
models = {
    'RandomForest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'MLPClassifier': MLPClassifier(),
    'SVC': SVC()
}

param_grids = {
    'RandomForest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [None, 10, 20, 30],
        'classifier__min_samples_split': [2, 5, 10]
    },
    'AdaBoost': {
        'classifier__n_estimators': [50, 100, 200],
        'classifier__learning_rate': [0.01, 0.1, 1]
    },
    'MLPClassifier': {
        'classifier__hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
        'classifier__activation': ['tanh', 'relu'],
        'classifier__solver': ['sgd', 'adam'],
        'classifier__alpha': [0.0001, 0.05],
        'classifier__learning_rate': ['constant','adaptive']
    },
    'SVC': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__gamma': [1, 0.1, 0.01, 0.001],
        'classifier__kernel': ['rbf', 'poly', 'sigmoid']
    }
}

In [None]:
# import pickle

# best_models = {}

# for model_name, model in models.items():
#     pipeline = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier', model)
#     ])
    
#     grid_search = GridSearchCV(pipeline, param_grids[model_name], cv=5, verbose=3, n_jobs=-1)
#     grid_search.fit(X_train, y_train)
    
#     best_models[model_name] = grid_search.best_estimator_
    
#     # Save the model with the highest accuracy
#     if model_name == max(best_models, key=lambda k: grid_search.best_score_):
#         with open(f'{model_name}_best_model.pkl', 'wb') as f:
#             pickle.dump(best_models[model_name], f)

#     print(f"Best parameters for {model_name}: {grid_search.best_params_}")
#     print(f"Best cross-validation accuracy for {model_name}: {grid_search.best_score_}\n")

# # Optionally, load the best model later
# # with open(f'{model_name}_best_model.pkl', 'rb') as f:
# #     loaded_model = pickle.load(f)

In [None]:
for model_name, model in best_models.items():
    y_pred = model.predict(X_test)
    print(f'{model_name} results:')
    print(classification_report(y_test, y_pred))
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('\n')

In [None]:
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

pipeline_ab = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier())
])

pipeline_mlp = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier())
])

pipeline_svc = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC())
])

In [None]:

# Create the VotingClassifier with pipelines
voting_clf = VotingClassifier(
    estimators=[
        ('rf', pipeline_rf),
        ('ab', pipeline_ab),
        ('mlp', pipeline_mlp),
        ('svc', pipeline_svc)
    ],
    voting='hard'  # Use 'soft' for probability voting
)

# Fit the VotingClassifier
voting_clf.fit(X_train, y_train)

# Evaluate the VotingClassifier
y_pred_voting = voting_clf.predict(X_test)
print("Ensemble Voting Classifier:")
print(f"Classification report:\n{classification_report(y_test, y_pred_voting)}")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred_voting)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_voting)}\n")

# Save the VotingClassifier
with open('voting_classifier.pkl', 'wb') as f:
    pickle.dump(voting_clf, f)

In [None]:
from sklearn.ensemble import StackingClassifier

# Define the meta-model (final estimator)
meta_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Create the StackingClassifier with pipelines
stacking_clf = StackingClassifier(
    estimators=[
        ('rf', pipeline_rf),
        ('ab', pipeline_ab),
        ('mlp', pipeline_mlp),
        ('svc', pipeline_svc)
    ],
    final_estimator=meta_model,
    cv=5
)

# Fit the StackingClassifier
stacking_clf.fit(X_train, y_train)

# Evaluate the StackingClassifier
y_pred_stacking = stacking_clf.predict(X_test)
print("Stacking Classifier:")
print(f"Classification report:\n{classification_report(y_test, y_pred_stacking)}")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_pred_stacking)}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_stacking)}\n")

# Save the StackingClassifier
with open('stacking_classifier.pkl', 'wb') as f:
    pickle.dump(stacking_clf, f)
    

In [None]:
y_pred_stacking