In [121]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

##Importing train data

In [122]:
train_data = pd.read_csv('../datasets/horse_health/train.csv')
origin = pd.read_csv('../datasets/horse_health/horse.csv')
origin.to_csv('./origin.csv', index=False)

In [123]:
train_data.drop('id', axis=1, inplace=True)
train_data = pd.concat([train_data, origin], ignore_index=True)
train_data.drop_duplicates(inplace=True)

In [124]:
y = train_data['outcome']
train_data.drop('outcome', axis=1, inplace=True)
y.isnull().sum()

0

##Importing test data

In [125]:
test_data = pd.read_csv('../datasets/horse_health/test.csv')
test_id = test_data['id']
test_data.drop('id', axis=1, inplace=True)

In [126]:
combined_data = pd.concat([train_data, test_data], ignore_index=True)

combined_data.drop(['lesion_2', 'lesion_3'], axis=1, inplace=True)

In [127]:
### Utility functions
def get_num_cols(df):
    return [col for col in df.columns if df[col].dtype in ['int64', 'float64']]

def get_cat_cols(df):
    return [col for col in df.columns if df[col].dtype == 'object']

def map_values(df, col, map):
    df[col] = df[col].map(map)
    return df

In [128]:
pain_map = {
    "depressed":1,
    "mild_pain":2,
    "extreme_pain":4,
    "alert":3,
    "severe_pain":5,
    "slight":0
}

live_death_map = {
    "lived":0,
    "died":2,
    "euthanized":1
}

age_map = {
    "adult":1,
    "young":0
}

capillary_refill_time_map = {
    "less_3_sec":0,
    "more_3_sec":1,
    "3":2
}

peripheral_pulse_map = {
    "normal":0,
    "increased":1,
    "reduced":2,
    "absent":3
}

mucous_membrane_map = {
    "normal_pink":0,
    "pale_pink":1,
    "pale_cyanotic":2,
    "bright_pink":3,
    "bright_red":4,
    "dark_cyanotic":5
}

abdominal_distention_map = {
    "none":0,
    "slight":1,
    "moderate":2,
    "severe":3
}

nasogastric_tube_map = {
    "slight":0,
    "none":1,
    "significant":2
}

nasogastric_reflux_map = {
    "none":0,
    "more_1_liter":1,
    "less_1_liter":2
}

rectal_exam_feces_map = {
    "absent":0,
    "normal":1,
    "decreased":2,
    "increased":3
}

abdomen_map = {
    "distend_large":0,
    "distend_small":1,
    "normal":2,
    "other":3
}

abdomo_appearance_map = {
    "cloudy":0,
    "serosanguious":1,
    "clear":2
}

peristalsis_map = {
    "absent":0,
    "hypermotile":1,
    "hypomotile":2,
    "normal":3,
    "distended_small":4
}

temp_of_extremities_map = {
    "cold":0,
    "warm":1,
    "normal":2,
    "cool":3
}

surgery_map = cp_data_map = surgical_lesion_map = {
    "yes":1,
    "no":0
}

y = y.map(live_death_map)

map_values(combined_data, 'pain', pain_map)
map_values(combined_data, 'age', age_map)
map_values(combined_data, 'capillary_refill_time', capillary_refill_time_map)
map_values(combined_data, 'peripheral_pulse', peripheral_pulse_map)
map_values(combined_data, 'mucous_membrane', mucous_membrane_map)
map_values(combined_data, 'abdominal_distention', abdominal_distention_map)
map_values(combined_data, 'nasogastric_tube', nasogastric_tube_map)
map_values(combined_data, 'nasogastric_reflux', nasogastric_reflux_map)
map_values(combined_data, 'rectal_exam_feces', rectal_exam_feces_map)
map_values(combined_data, 'abdomen', abdomen_map)
map_values(combined_data, 'abdomo_appearance', abdomo_appearance_map)
map_values(combined_data, 'peristalsis', peristalsis_map)
map_values(combined_data, 'temp_of_extremities', temp_of_extremities_map)
map_values(combined_data, 'surgery', surgery_map)
map_values(combined_data, 'cp_data', cp_data_map)
map_values(combined_data, 'surgical_lesion', surgical_lesion_map)


Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,cp_data
0,1,1,530001,38.1,132.0,24.0,3.0,2.0,5.0,1.0,...,6.5,2.0,1.0,57.0,8.5,1.0,3.4,1,2209,0
1,1,1,533836,37.5,88.0,12.0,3.0,0.0,2.0,1.0,...,2.0,0.0,1.0,33.0,64.0,1.0,2.0,1,2208,0
2,1,1,529812,38.3,120.0,28.0,3.0,2.0,1.0,0.0,...,3.5,,0.0,37.0,6.4,1.0,3.4,1,5124,0
3,1,1,5262541,37.1,72.0,30.0,0.0,2.0,1.0,1.0,...,2.0,2.0,1.0,53.0,7.0,0.0,3.9,1,2208,1
4,0,1,5299629,38.0,52.0,48.0,2.0,0.0,0.0,0.0,...,7.0,1.0,2.0,47.0,7.3,0.0,2.6,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2350,0,1,529461,40.3,114.0,36.0,3.0,2.0,0.0,1.0,...,7.0,1.0,0.0,57.0,8.1,1.0,4.5,1,3205,1
2351,1,1,535338,37.2,100.0,20.0,3.0,2.0,2.0,1.0,...,2.0,0.0,1.0,50.0,66.0,1.0,2.0,1,2209,0
2352,1,1,529640,39.2,132.0,12.0,3.0,2.0,5.0,1.0,...,6.5,2.0,,53.0,7.6,1.0,4.5,1,2205,0
2353,0,1,5287179,38.3,54.0,66.0,2.0,0.0,0.0,0.0,...,7.0,1.0,,49.0,8.6,2.0,5.0,0,3111,1


In [129]:
train_data = pd.DataFrame(combined_data.iloc[:y.shape[0]])
train_data['outcome'] = y
train_data.to_csv('./train_data.csv', index=False)
test_data = combined_data.iloc[y.shape[0]:]

In [130]:
missing_cols_train = train_data.isnull().sum()[train_data.isnull().sum() > 0].index
missing_cols_test = test_data.isnull().sum()[test_data.isnull().sum() > 0].index

missing_train_rows = pd.DataFrame(train_data[train_data.isnull().any(axis=1)])
missing_test_rows = pd.DataFrame(test_data[test_data.isnull().any(axis=1)])

In [131]:
missing_train_rows

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,cp_data,outcome
2,1,1,529812,38.3,120.0,28.0,3.0,2.0,1.0,0.0,...,,0.0,37.0,6.4,1.0,3.4,1,5124,0,0.0
5,0,1,529642,38.1,56.0,32.0,2.0,0.0,3.0,0.0,...,2.0,,49.0,8.0,0.0,2.8,0,0,1,0.0
6,1,1,534787,38.3,36.0,16.0,3.0,2.0,0.0,0.0,...,2.0,,43.0,75.0,0.0,1.0,0,3111,1,1.0
8,0,1,528742,37.4,48.0,12.0,3.0,2.0,0.0,0.0,...,1.0,,40.0,7.8,0.0,2.6,0,0,1,0.0
9,1,1,529640,38.3,129.0,48.0,3.0,2.0,1.0,0.0,...,,,57.0,4.9,0.0,2.9,1,3209,1,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526,1,1,533886,,120.0,70.0,0.0,,2.0,1.0,...,,0.0,55.0,65.0,,,0,3205,0,2.0
1527,0,1,527702,37.2,72.0,24.0,3.0,1.0,2.0,1.0,...,0.0,1.0,44.0,,1.0,3.3,1,2208,1,1.0
1528,1,1,529386,37.5,72.0,30.0,0.0,2.0,2.0,0.0,...,2.0,0.0,60.0,6.8,,,1,3205,0,0.0
1529,1,1,530612,36.5,100.0,24.0,3.0,2.0,1.0,0.0,...,0.0,1.0,50.0,6.0,1.0,3.4,1,2208,1,1.0


In [132]:
missing_train_rows.to_csv('./missing_train_rows.csv', index=True)

In [133]:
num_cols = get_num_cols(train_data)
cat_cols = get_cat_cols(train_data)
unique_vals = {col: train_data[col].nunique() for col in cat_cols}

In [134]:
train_data.drop(missing_train_rows.index, axis=0, inplace=True)
train_data.reset_index(drop=True, inplace=True)
train_data.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
cp_data                  0
outcome                  0
dtype: int64

In [135]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def pred_missing_cols(df, pure_df, model):
    missing_cols = df.isnull().sum()[df.isnull().sum() > 0].index
    df.drop(missing_cols, axis=1, inplace=True)
    print(missing_cols)
    cat_cols = get_cat_cols(df)
    
    for col in missing_cols:
        
        X = pure_df.copy()
        y = pure_df[col]
        X.drop(missing_cols, inplace=True, axis=1)
        
        model.fit(X, y, verbose=False, early_stopping_rounds=5)
        y_pred = model.predict(df)
        df[col] = y_pred
        
        # print(f"Accuracy score for {col}: {accuracy_score(y_test, y_pred)}")
        # print(f"Confusion matrix for {col}: \n{confusion_matrix(y_test, y_pred)}")
        # print(f"Classification report for {col}: \n{classification_report(y_test, y_pred)}")
        # print("\n\n")
        
model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=5, loss_function='MultiClass')

pred_missing_cols(missing_train_rows, train_data, model)
print(missing_train_rows.isnull().sum())

Index(['rectal_temp', 'pulse', 'respiratory_rate', 'temp_of_extremities',
       'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain',
       'peristalsis', 'abdominal_distention', 'nasogastric_tube',
       'nasogastric_reflux', 'nasogastric_reflux_ph', 'rectal_exam_feces',
       'abdomen', 'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome'],
      dtype='object')
surgery                  0
age                      0
hospital_number          0
surgical_lesion          0
lesion_1                 0
cp_data                  0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
pa

In [136]:
missing_train_rows = missing_train_rows.astype('int64')
#missing_train_rows.to_csv('./missing_train_rows.csv', index=False)

In [137]:
train_data = pd.concat([train_data, missing_train_rows], ignore_index=True)
#train_data.to_csv('./train_data.csv', index=False)
train_data.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
cp_data                  0
outcome                  0
dtype: int64

In [138]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop('outcome', axis=1), train_data['outcome'], test_size=0.2, random_state=42)

In [139]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print(f"Confusion matrix: \n{confusion_matrix(y_test, y_pred)}")
print(f"Classification report: \n{classification_report(y_test, y_pred)}")

0:	learn: 1.0422348	total: 1.49ms	remaining: 1.49s
1:	learn: 0.9978633	total: 3.09ms	remaining: 1.54s
2:	learn: 0.9561983	total: 4.47ms	remaining: 1.49s
3:	learn: 0.9208494	total: 5.9ms	remaining: 1.47s
4:	learn: 0.8929671	total: 7.17ms	remaining: 1.43s
5:	learn: 0.8656270	total: 8.22ms	remaining: 1.36s
6:	learn: 0.8387421	total: 9.49ms	remaining: 1.35s
7:	learn: 0.8186498	total: 10.9ms	remaining: 1.35s
8:	learn: 0.8001608	total: 12.3ms	remaining: 1.35s
9:	learn: 0.7853540	total: 13.6ms	remaining: 1.35s
10:	learn: 0.7696373	total: 14.9ms	remaining: 1.34s
11:	learn: 0.7544401	total: 16.2ms	remaining: 1.33s
12:	learn: 0.7402156	total: 17.6ms	remaining: 1.34s
13:	learn: 0.7247402	total: 19ms	remaining: 1.34s
14:	learn: 0.7143034	total: 20.4ms	remaining: 1.34s
15:	learn: 0.7050719	total: 21.7ms	remaining: 1.33s
16:	learn: 0.6952732	total: 23.1ms	remaining: 1.34s
17:	learn: 0.6852432	total: 24.5ms	remaining: 1.33s
18:	learn: 0.6764864	total: 25.8ms	remaining: 1.33s
19:	learn: 0.6683549	tota

In [140]:
y_pred = model.predict(test_data)

In [141]:
reverse_l_d = {v:k for k,v in live_death_map.items()}
submission = pd.DataFrame(y_pred, index=test_id).rename(columns={0:'outcome'})

In [142]:
submission['outcome'] = submission['outcome'].map(reverse_l_d)
submission.to_csv('./submission.csv', index=True)