In [35]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report
from sklearn.metrics import f1_score

### Obtaining and understanding data:

In [36]:
data = pd.read_csv('data/train.csv')
add_data = pd.read_csv('data/horse.csv')

In [37]:
data.drop(['id', 'hospital_number', 'abdomen'], axis = 1, inplace = True)
add_data.drop(['hospital_number', 'abdomen'], axis = 1, inplace = True)
data = pd.concat([data, add_data])

In [38]:
float_columns = data.select_dtypes(include=['float64']).columns
object_columns = data.select_dtypes(include=['object']).columns

In [39]:
data[float_columns].isnull().sum()

rectal_temp               60
pulse                     24
respiratory_rate          58
nasogastric_reflux_ph    246
packed_cell_volume        29
total_protein             33
abdomo_protein           198
dtype: int64

In [40]:
data[object_columns].isnull().sum()

surgery                    0
age                        0
temp_of_extremities       95
peripheral_pulse         129
mucous_membrane           68
capillary_refill_time     38
pain                      99
peristalsis               64
abdominal_distention      79
nasogastric_tube         184
nasogastric_reflux       127
rectal_exam_feces        292
abdomo_appearance        213
surgical_lesion            0
cp_data                    0
outcome                    0
dtype: int64

### Preprocessing the data

In [41]:
def clean_data(clean_data):

    #clean_data.drop(columns = ['pain', 'age' , 'nasogastric_reflux', 'lesion_1', 'lesion_2', 'lesion_3', 'rectal_temp', 'mucous_membrane', 'cp_data', 'nasogastric_tube'], inplace = True)
    
    clean_data['temp_of_extremities'].fillna('cool', inplace = True)
    clean_data['mucous_membrane'].fillna('other', inplace = True)
    clean_data['mucous_membrane'] = clean_data['mucous_membrane'].apply(lambda x: 'other' if x in ['absent', 'increased'] else x)
    clean_data['capillary_refill_time'].fillna('less_3_sec', inplace = True)
    clean_data['capillary_refill_time'] = clean_data['capillary_refill_time'].str.replace(pat = '3', repl = 'less_3_sec')
    clean_data['pain'].fillna('pain', inplace = True)
    clean_data['pain'] = clean_data['pain'].str.replace(pat = 'slight', repl = 'severe_pain')
    clean_data['peristalsis'].fillna('absent', inplace = True)
    clean_data['abdominal_distention'].fillna('severe', inplace = True)
    clean_data['nasogastric_tube'].fillna('missing', inplace = True)
    clean_data['nasogastric_reflux'] = clean_data['nasogastric_reflux'].str.replace(pat = 'slight', repl = 'missing')
    clean_data['nasogastric_reflux'].fillna('missing', inplace = True)
    clean_data['rectal_exam_feces'].fillna('missing', inplace = True)
    clean_data['rectal_exam_feces'] = clean_data['rectal_exam_feces'].str.replace(pat = 'serosanguious', repl = 'missing')
    clean_data['abdomo_appearance'].fillna('missing', inplace = True)

    for column in float_columns:
        data[column].fillna(data[column].median(skipna = True), inplace = True)
    
    data.outcome = data.outcome.map({'died': 0, 'euthanized': 1, 'lived': 2})

    return clean_data

In [42]:
data = clean_data(data)

minmax_scaler = MinMaxScaler()
minmax_scaler.fit(data[float_columns])
data[float_columns] = minmax_scaler.transform(data[float_columns])
label_encoders = {x: LabelEncoder() for x in object_columns}

for column in object_columns:
    label_encoders[column].fit(data[column])
    data[column] = label_encoders[column].transform(data[column])

y_train = data['outcome']
x_train = data.drop(columns = ['outcome'])
x_train = pd.get_dummies(x_train, columns = object_columns.drop('outcome'), dtype=np.int8)

In [43]:
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(x_train, y_train)

In [44]:
y = data['outcome']
data.drop(columns = ['outcome'], inplace = True)

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2)
#Because of the low number of rows, bootstrap will be set to False.

custom_scorer = make_scorer(f1_score, greater_is_better=True)
param_grid = {'n_estimators': [50, 100, 150, 200, 250, 300],  # being the number of trees in the forest.
                  'min_samples_leaf': [3, 5, 10, 20],  # number of minimum samples required at a leaf node.
                  'min_samples_split': [3, 6, 9],  # number of minimum samples required to split an internal node.
                  'criterion': ['entropy'],  # measures the quality of a split. Can use gini's impurity or entropy.
                  # 'subsample':[0.5,0.8,1]#buscar con mas detalle
                  # 'reg_lambda':[1,10,100]#buscar con mas detalle
                  }

clf = GridSearchCV(
# Evaluates the performance of different groups of parameters for a model based on cross-validation.
    RandomForestClassifier(class_weight='balanced', bootstrap=False, random_state=1234),
    param_grid,  # dict of parameters.
    cv=10,  # Specified number of folds in the Cross-Validation(K-Fold).
    scoring='f1_micro')
        # clf = GridSearchCV(DecisionTreeClassifier(max_leaf_nodes=3, random_state=0), param_grid2)
clf.fit(X_train, y_train)
        # print("Best estimator found by grid search:")
        # print(clf.best_estimator_)
model = clf.best_estimator_  # Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data        model.fit(X_train, y_train)


print('RandomForest:')
y_pred_rf = model.predict(X_test)
print(classification_report(y_pred_rf, y_test))
print(f1_score(y_test, y_pred_rf, average='micro'))

RandomForest:
              precision    recall  f1-score   support

           0       0.89      0.78      0.83       165
           1       0.94      0.88      0.91       160
           2       0.70      0.87      0.77       127

    accuracy                           0.84       452
   macro avg       0.84      0.84      0.84       452
weighted avg       0.85      0.84      0.84       452

0.838495575221239
