In [None]:
import numpy as np
import pandas as pd
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, balanced_accuracy_score, fbeta_score
import pickle

## Load the Dataset

In [None]:
# Read the descriptor table from the CSV file in the working directory
descriptors_csv = 'descriptorsGSK3.csv'
dataset = pd.read_csv(descriptors_csv)

## Split the Dataset

In [None]:
# Remove the 'stripped_smiles' column so it is not used as a feature.
# errors = 'ignore' keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise KeyError.
dataset = dataset.drop(columns = 'stripped_smiles', errors = 'ignore')
# Features are every remaining column except the 'gsk3' target
X = dataset.drop(columns = 'gsk3')
y = dataset['gsk3']
# Split the dataset into random train (70%) and test (30%) subsets,
# stratified on the target and seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 0, stratify = y)

## Imputation of missing values

In [None]:
# Complete NaN values in each column with the median.
# The medians are learned from the training split only and then reused for the
# test split: computing them on the whole dataset would leak information from
# the test rows into the preprocessing.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

## Data Standardization

In [None]:
# Learn the scaling statistics from the training split only
scaler = StandardScaler()
scaler.fit(X_train)

# Apply that same fitted transformation to both splits
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Random Forest Model


In [None]:
# Hyper-parameter grid explored for the random forest
n_estimators = [10, 50, 70, 100, 150, 200]
max_depth = [None, 5, 7, 10, 13, 15]
rf_columns = ['n_estimators', 'max_depth', 'sensitivity', 'specificity', 'precision', 'f1_score', 'balanced_accuracy', 'fbeta_score']
# Collect one record per parameter set and build the dataframe once at the end
# instead of growing it row by row
rf_records = []
for n in n_estimators:
    for d in max_depth:
        # Define the random forest classifier with its parameters
        model = RandomForestClassifier(n_estimators = n, max_depth = d, n_jobs = 10, random_state = 0, class_weight = "balanced")
        # Train the model with the train data
        model.fit(X_train, y_train)
        # The model makes predictions for the test data
        y_pred = model.predict(X_test)

        # Metrics are stored as numbers (not text) so they sort and compare
        # numerically. The F-scores are called through sklearn.metrics and kept
        # under dictionary keys, so no imported function name is rebound.
        rf_records.append({
            'n_estimators': n,
            # Kept as text so an unlimited depth is written as "None" rather than an empty value
            'max_depth': str(d),
            'sensitivity': recall_score(y_test, y_pred),
            # Specificity is the recall of the negative class (label 0)
            'specificity': recall_score(y_test, y_pred, pos_label = 0),
            'precision': precision_score(y_test, y_pred),
            'f1_score': sklearn.metrics.f1_score(y_test, y_pred),
            'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
            # beta = 2 weights recall higher than precision
            'fbeta_score': sklearn.metrics.fbeta_score(y_test, y_pred, beta = 2),
        })

# Dataframe with the metrics for each set of parameters
df_RF = pd.DataFrame(rf_records, columns = rf_columns)

In [None]:
# Sort the performance metrics by fbeta_score in descending order (best first).
# key = pd.to_numeric makes the comparison numeric even when the column holds
# text, which would otherwise be ordered lexicographically.
df_RF = df_RF.sort_values(by = "fbeta_score", key = pd.to_numeric, ascending = False)
# Write the ranked table; the file name carries a proper ".csv" extension
df_RF.to_csv("GSK3_RFresults3.csv", index = False)

### Save models trained with train data

In [None]:
# Train and persist the selected random forests (max_depth = 10) on the train
# split. One loop replaces three copy-pasted cells that differed only in
# n_estimators; the order 100, 70, 150 matches the original cells.
for n_trees in (100, 70, 150):
    model = RandomForestClassifier(n_estimators = n_trees, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
    model.fit(X_train, y_train)
    # The context manager closes the file even if pickling fails
    with open(f'modelRF_GSK3_{n_trees}_10.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

## C-Support Vector Classification Model


In [None]:
# Regularization strengths explored for the support vector classifier
C_range = [3, 1, 0.01, 0.001]
svc_columns = ['C', 'sensitivity', 'specificity', 'precision', 'f1_score', 'balanced_accuracy', 'fbeta_score']
# Collect one record per value of C and build the dataframe once at the end
svc_records = []
for c in C_range:
    # Probability estimates are not requested: only predict() is used here, and
    # enabling them makes every fit run an extra internal 5-fold cross-validation.
    model = SVC(C = c, class_weight = "balanced", random_state = 0)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Progress indicator: value of C that has just been evaluated
    print(c)

    # Metrics are stored as numbers (not text) so they sort and compare
    # numerically. The F-scores are called through sklearn.metrics and kept
    # under dictionary keys, so no imported function name is rebound.
    svc_records.append({
        'C': c,
        'sensitivity': recall_score(y_test, y_pred),
        # Specificity is the recall of the negative class (label 0)
        'specificity': recall_score(y_test, y_pred, pos_label = 0),
        'precision': precision_score(y_test, y_pred),
        'f1_score': sklearn.metrics.f1_score(y_test, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
        # beta = 2 weights recall higher than precision
        'fbeta_score': sklearn.metrics.fbeta_score(y_test, y_pred, beta = 2),
    })

# Dataframe with the metrics for each value of C
df_SVC = pd.DataFrame(svc_records, columns = svc_columns)

In [None]:
# Sort the performance metrics by fbeta_score in descending order (best first).
# key = pd.to_numeric makes the comparison numeric even when the column holds
# text, which would otherwise be ordered lexicographically.
df_SVC = df_SVC.sort_values(by = "fbeta_score", key = pd.to_numeric, ascending = False)
df_SVC.to_csv("GSK3_SVCresults.csv", index = False)

### Save models trained with train data

In [None]:
# Train and persist the selected SVC models on the train split.
# One loop replaces two copy-pasted cells that differed only in C.
for c_value in (3, 1):
    model = SVC(C = c_value, class_weight = "balanced", random_state = 0)
    model.fit(X_train, y_train)
    # The context manager closes the file even if pickling fails
    with open(f'modelSVC_GSK3_{c_value}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

## Save Models trained with all data

In [None]:
# Impute every missing value with its column median, computed here on the full feature table
full_medians = X.median()
X_without_NaN = X.fillna(full_medians)
# Re-fit the existing scaler on the complete data (replacing the statistics it
# held before) and standardize the features with it
scaler.fit(X_without_NaN)
X_scl = scaler.transform(X_without_NaN)
# NOTE(review): neither the medians nor the fitted scaler are written to disk in
# the cells shown here - confirm they are persisted wherever the saved models are used.

### Save Random Forest Models

In [None]:
# Train and persist the selected random forests (max_depth = 10) on the full,
# imputed and standardized dataset. One loop replaces three copy-pasted cells
# that differed only in n_estimators; the order 100, 70, 150 matches the original cells.
for n_trees in (100, 70, 150):
    model = RandomForestClassifier(n_estimators = n_trees, max_depth = 10, n_jobs = 10, random_state = 0, class_weight = "balanced")
    model.fit(X_scl, y)
    # The context manager closes the file even if pickling fails
    with open(f'modelALLRF_GSK3_{n_trees}_10.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

### Save SVC Models

In [None]:
# Train and persist the selected SVC models on the full, imputed and
# standardized dataset. One loop replaces two copy-pasted cells that differed only in C.
for c_value in (3, 1):
    model = SVC(C = c_value, class_weight = "balanced", random_state = 0)
    model.fit(X_scl, y)
    # The context manager closes the file even if pickling fails
    with open(f'modelALLSVC_GSK3_{c_value}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)