In [None]:
#Competition: https://www.kaggle.com/competitions/playground-series-s3e22/data
#Original Dataset and column info: https://www.kaggle.com/datasets/yasserh/horse-survival-dataset

In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [49]:
data = pd.read_csv('train.csv')
y = data['outcome']
data.drop(['id', 'hospital_number', 'outcome'], axis = 1, inplace = True)

### Structure of the data:

In [4]:
data.shape

(1235, 27)

In [5]:
data.isna().sum()

surgery                    0
age                        0
rectal_temp                0
pulse                      0
respiratory_rate           0
temp_of_extremities       39
peripheral_pulse          60
mucous_membrane           21
capillary_refill_time      6
pain                      44
peristalsis               20
abdominal_distention      23
nasogastric_tube          80
nasogastric_reflux        21
nasogastric_reflux_ph      0
rectal_exam_feces        190
abdomen                  213
packed_cell_volume         0
total_protein              0
abdomo_appearance         48
abdomo_protein             0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64

In [6]:
data.dtypes

surgery                   object
age                       object
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
outcome                   object
dtype: object

### Current Work

#### Each row has a record of a horse, its data and its outcome. There can be 1+ row per horse, meaning that a horse might have gone through multiple encounters

##### Hypothesis 1: Should hospital_number be dropped then?

#### Data Cleaning:

In [50]:
float_columns = data.select_dtypes(include=['float64']).columns
object_columns = data.select_dtypes(include=['object']).columns

for column in float_columns:
    data[column].fillna(data[column].median(skipna = True), inplace = True)

for column in object_columns:
    data[column].fillna('missing', inplace = True)

minmax_scaler = MinMaxScaler()
minmax_scaler.fit(data[float_columns])
data[float_columns] = minmax_scaler.transform(data[float_columns])

label_encoders = {x: LabelEncoder() for x in object_columns}

for column in object_columns:
    label_encoders[column].fit(data[column])
    data[column] = label_encoders[column].transform(data[column])

data = pd.get_dummies(data, columns = object_columns, dtype=np.int8)


#### Model training

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report

In [51]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)
#Because of the low number of rows, bootstrap will be set to False.
rf = RandomForestClassifier(class_weight='balanced', n_estimators=225, bootstrap=False)
gb = GradientBoostingClassifier(random_state=1234)
lr = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=2)

print('RandomForest:')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_pred, y_test))
print('-------------------------------')

RandomForest:
              precision    recall  f1-score   support

        died       0.69      0.68      0.68        87
  euthanized       0.58      0.56      0.57        50
       lived       0.68      0.70      0.69       110

    accuracy                           0.66       247
   macro avg       0.65      0.65      0.65       247
weighted avg       0.66      0.66      0.66       247

-------------------------------
