In [1]:
#!/bin/python

import sys
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from numpy import interp
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import plot_precision_recall_curve
from sklearn import preprocessing
from pathlib import Path
from joblib import dump, load

In [2]:

# --- Random-forest hyperparameters and cross-validation seeds ---
n_tree, n_max_depth = 1000, 4
n_min_samples_split, n_min_samples_leaf = 3, 5
list_seed = [547, 254, 758]

# Alternative configuration kept for reference (inactive):
#   n_tree = 1000, n_max_depth = 7, n_min_samples_split = 3,
#   n_min_samples_leaf = 5, list_seed = [419, 713, 13]

# Outcome column the classifier is trained to predict.
prediction_variable = 'ACR20'

# Minimum standard deviation (after min-max scaling) a feature must have to be kept.
std_cutoff = 0.05

# Make sure the output directories for the saved models exist.
for model_dir in ("./Models", "./Models/RF"):
    Path(model_dir).mkdir(parents=True, exist_ok=True)

df = pd.read_csv('./Input_data/RA_dataframe_initial_fu_20210112_naiveusers.txt', sep='\t')

# Drop rows whose outcome carries an excluded code (DFT1 == 2 or ACR20 == 3).
# NOTE(review): the meaning of these codes is not visible here -- confirm against the data dictionary.
if prediction_variable == 'DFT1':
    df = df.loc[df['DFT1'] != 2]
elif prediction_variable == 'ACR20':
    df = df.loc[df['ACR20'] != 3]

colnames = df.columns
# Columns excluded from the predictor set (per the original note, they represent results).
result_column_list = ['newID', 'region', 'BSD', 'DFT1', 'DFT2', 'ACR20', 'ACR50', 'ACR70', 'EULAR']
x_colnames_1 = [name for name in colnames if name not in result_column_list]

In [3]:
# Rows from regions 1 and 11 form the independent set; every other row is used for training.
in_independent_region = (df.region == 1) | (df.region == 11)
df_training = df[~in_independent_region]
df_independent = df[in_independent_region]

In [5]:
# 1) Training dataset

# Predictor columns only, still on their original (unscaled) values.
df_training_remov_result = df_training.loc[:, x_colnames_1]

# Drop columns whose standard deviation is exactly zero (a single repeated value).
has_spread = df_training_remov_result.std() != 0
df_training_remov_novar = df_training_remov_result.loc[:, has_spread]

# Drop low-variance columns: min-max scale each column, then keep only those
# whose standard deviation on the scaled values reaches std_cutoff.
pre_scaler = preprocessing.MinMaxScaler()
df_training_pre_scaled = pre_scaler.fit(df_training_remov_novar).transform(df_training_remov_novar)
remain_boolean = np.std(df_training_pre_scaled, axis=0) >= std_cutoff
# colnames_remain is reused to select the same columns from the independent data.
colnames_remain = df_training_remov_novar.columns[remain_boolean]

# Unscaled feature matrix and label vector; scaling is applied later, per CV fold.
data_x_training_bf_scaling = df_training_remov_novar.loc[:, colnames_remain].to_numpy()
data_y_training = df_training.loc[:, prediction_variable].to_numpy()

In [7]:
# print(len(colnames_remain))

74


In [5]:
# 2) Independent dataset

# Select the same feature columns that survived filtering on the training data.
data_x_independent_bf_scaling = df_independent.loc[:, colnames_remain].to_numpy()
data_y_independent = df_independent.loc[:, prediction_variable].to_numpy()

In [6]:
# Train and save one random forest per (seed, fold) pair.
# Each seed defines a shuffled 3-fold stratified split of the training data; for
# every fold a forest is fitted on the fold's training portion and written to
# ./Models/RF/ with the hyperparameters, seed and 1-based fold number in its name.
for seed in list_seed:

    print(' '.join([prediction_variable, str(n_tree), str(n_max_depth), str(n_min_samples_split), str(n_min_samples_leaf)]))

    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

    for fold_number, (train, test) in enumerate(
            cv.split(data_x_training_bf_scaling, data_y_training), start=1):

        # Fit the scaler on the fold's training rows only, then apply it to the
        # held-out rows, so the held-out rows do not influence the scaling.
        # NOTE(review): the fitted scaler is not saved alongside the model --
        # confirm that downstream code re-creates it the same way.
        training_scaler = preprocessing.MinMaxScaler()

        data_x_training_train = training_scaler.fit_transform(data_x_training_bf_scaling[train])
        # Not used in this cell; kept in case a later cell inspects it.
        data_x_training_test = training_scaler.transform(data_x_training_bf_scaling[test])

        # random_state ties the forest's own randomness to the seed, so the saved
        # model for a given (seed, fold) is the same on every run.
        model = RandomForestClassifier(n_estimators=n_tree, max_depth=n_max_depth,
                                       min_samples_split=n_min_samples_split,
                                       min_samples_leaf=n_min_samples_leaf,
                                       random_state=seed)
        model.fit(data_x_training_train, data_y_training[train])

        dump(model, './Models/RF/RF_{0}_{1}_{2}_{3}_{4}_{5}_{6}.joblib'.format(
            prediction_variable, n_tree, n_max_depth, n_min_samples_split, n_min_samples_leaf, seed, fold_number))
        print('model is saved')


ACR20 1000 4 3 5
model is saved
model is saved
model is saved
ACR20 1000 4 3 5
model is saved
model is saved
model is saved
ACR20 1000 4 3 5
model is saved
model is saved
model is saved
