In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report
from sklearn.metrics import f1_score

### Obtaining and understanding data:

In [2]:
original_data = pd.read_csv('data/train.csv')
additional_data = pd.read_csv('data/horse.csv')

In [3]:
original_data.drop(['id'], axis = 1, inplace = True)
raw_data = pd.concat([original_data, additional_data])
raw_data.outcome = raw_data.outcome.map({'died': 0, 'euthanized': 1, 'lived': 2})

In [4]:
float_columns = raw_data.select_dtypes(include=['float64']).columns
object_columns = raw_data.select_dtypes(include=['object']).columns

In [5]:
raw_data[float_columns].isnull().sum()

rectal_temp               60
pulse                     24
respiratory_rate          58
nasogastric_reflux_ph    246
packed_cell_volume        29
total_protein             33
abdomo_protein           198
dtype: int64

In [6]:
raw_data[object_columns].isnull().sum()

surgery                    0
age                        0
temp_of_extremities       95
peripheral_pulse         129
mucous_membrane           68
capillary_refill_time     38
pain                      99
peristalsis               64
abdominal_distention      79
nasogastric_tube         184
nasogastric_reflux       127
rectal_exam_feces        292
abdomen                  331
abdomo_appearance        213
surgical_lesion            0
cp_data                    0
dtype: int64

In [7]:
[column for column in raw_data.columns if column not in float_columns.tolist() + object_columns.tolist()]

['hospital_number', 'lesion_1', 'lesion_2', 'lesion_3', 'outcome']

In [8]:
raw_target = raw_data['outcome']
raw_data.drop(columns = ['hospital_number', 'abdomen', 'lesion_2', 'lesion_3', 'outcome'], axis =1, inplace = True)
raw_data['lesion_1'] = raw_data['lesion_1'].astype(str)
#Update columns:
float_columns = raw_data.select_dtypes(include=['float64']).columns
object_columns = raw_data.select_dtypes(include=['object']).columns


### Preprocessing the data

In [9]:
def get_clean_data(clean_data, float_columns):

    #clean_data.drop(columns = ['pain', 'age' , 'nasogastric_reflux', 'lesion_1', 'lesion_2', 'lesion_3', 'rectal_temp', 'mucous_membrane', 'cp_data', 'nasogastric_tube'], inplace = True)
    clean_data['temp_of_extremities'].fillna('cool', inplace = True)
    clean_data['mucous_membrane'].fillna('other', inplace = True)
    clean_data['mucous_membrane'] = clean_data['mucous_membrane'].apply(lambda x: 'other' if x in ['absent', 'increased'] else x)
    clean_data['capillary_refill_time'].fillna('less_3_sec', inplace = True)
    clean_data['capillary_refill_time'] = clean_data['capillary_refill_time'].str.replace(pat = '3', repl = 'less_3_sec')
    clean_data['pain'].fillna('pain', inplace = True)
    clean_data['pain'] = clean_data['pain'].str.replace(pat = 'slight', repl = 'severe_pain')
    clean_data['peristalsis'].fillna('absent', inplace = True)
    clean_data['abdominal_distention'].fillna('severe', inplace = True)
    clean_data['nasogastric_tube'].fillna('missing', inplace = True)
    clean_data['nasogastric_reflux'] = clean_data['nasogastric_reflux'].str.replace(pat = 'slight', repl = 'missing')
    clean_data['nasogastric_reflux'].fillna('missing', inplace = True)
    clean_data['rectal_exam_feces'].fillna('missing', inplace = True)
    clean_data['rectal_exam_feces'] = clean_data['rectal_exam_feces'].str.replace(pat = 'serosanguious', repl = 'missing')
    clean_data['abdomo_appearance'].fillna('missing', inplace = True)

    for column in float_columns:
        clean_data[column].fillna(clean_data[column].median(skipna = True), inplace = True)

    
    return clean_data

In [10]:
clean_data = get_clean_data(raw_data, float_columns)
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(clean_data[float_columns])
label_encoders = {x: LabelEncoder() for x in object_columns}

clean_data[float_columns] = minmax_scaler.transform(clean_data[float_columns])
for column in object_columns:
    label_encoders[column].fit(clean_data[column])
    clean_data[column] = label_encoders[column].transform(clean_data[column])

y_train = raw_target
x_train = clean_data
x_train = pd.get_dummies(x_train, columns = object_columns, dtype=np.int8)

In [11]:
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(x_train, y_train)

### Model training and validation

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)
#Because of the low number of rows, bootstrap will be set to False.

custom_scorer = make_scorer(f1_score, greater_is_better=True)
param_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],  # being the number of trees in the forest.
                  'min_samples_leaf': [3, 5, 10, 20],  # number of minimum samples required at a leaf node.
                  'min_samples_split': [3, 6, 9],  # number of minimum samples required to split an internal node.
                  'criterion': ['entropy'],  # measures the quality of a split. Can use gini's impurity or entropy.
                  # 'subsample':[0.5,0.8,1]#buscar con mas detalle
                  # 'reg_lambda':[1,10,100]#buscar con mas detalle
                  }

clf = GridSearchCV(
# Evaluates the performance of different groups of parameters for a model based on cross-validation.
    RandomForestClassifier(class_weight='balanced', bootstrap=False, random_state=1234),
    param_grid,  # dict of parameters.
    cv=10,  # Specified number of folds in the Cross-Validation(K-Fold).
    scoring='f1_micro')
        # clf = GridSearchCV(DecisionTreeClassifier(max_leaf_nodes=3, random_state=0), param_grid2)
clf.fit(X_train, y_train)
        # print("Best estimator found by grid search:")
        # print(clf.best_estimator_)
model = clf.best_estimator_  # Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data        model.fit(X_train, y_train)


print('RandomForest:')
y_pred_rf = model.predict(X_test)
print(classification_report(y_pred_rf, y_test))
print('Micro F1-Score:', f1_score(y_test, y_pred_rf, average='micro'))

RandomForest:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86       182
           1       0.88      0.86      0.87       140
           2       0.73      0.84      0.78       130

    accuracy                           0.84       452
   macro avg       0.84      0.84      0.84       452
weighted avg       0.85      0.84      0.84       452

Micro F1-Score: 0.838495575221239
