In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [52]:
train_data = pd.read_csv('../datasets/horse_health/train.csv')
origin = pd.read_csv('../datasets/horse_health/horse.csv')
train_data.drop('id', axis=1, inplace=True)
train_data = pd.concat([train_data, origin], axis=0)
train_data.drop_duplicates(inplace=True)

test_data = pd.read_csv('../datasets/horse_health/test.csv')
test_id = test_data['id']

test_data.drop('id', axis=1, inplace=True)

y = train_data['outcome']
train_data.drop('outcome', axis=1, inplace=True)

combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
combined_data.columns
y.shape

(1531,)

In [42]:
missing_cols_train = train_data.columns[train_data.isnull().any()]
missing_cols_test = test_data.columns[test_data.isnull().any()]

missing_train_rows = pd.DataFrame(train_data[train_data.isnull().any(axis=1)])
missing_test_rows = pd.DataFrame(test_data[test_data.isnull().any(axis=1)])

missing_train_rows.to_csv('./missing_train_rows.csv', index=True)

In [43]:
missing_train_rows.drop(missing_cols_train, axis=1, inplace=True)
missing_test_rows.drop(missing_cols_test, axis=1, inplace=True)

In [44]:
### Utility functions
def get_num_cols(df):
    return [col for col in df.columns if df[col].dtype in ['int64', 'float64']]

def get_cat_cols(df):
    return [col for col in df.columns if df[col].dtype == 'object']

def transform_live_death_binary(df, cols):
    for col in cols:
        if df[col].dtype in ['int64', 'float64']:
            continue
        df[col] = df[col].apply(lambda x: 1 if x == 'euthanized' else 0 if x == 'lived' else 2)
        df[col] = df[col].astype('int64')
    return df

def transform_age(df, cols):
    for col in cols:
        df[col] = df[col].apply(lambda x: 1 if x == 'adult' else 0)
        df[col] = df[col].astype('int64')
    return df

def transform_y_n(df, col):
    df[col] = df[col].apply(lambda x: 1 if x == 'yes' else 0)
    return df

def treat_na(df, cols, filler_val):
    for col in cols:
        df[col] = df[col].fillna(filler_val)
    return df

def map_values(df, col, map):
    df[col] = df[col].map(map)
    return df



In [45]:
pain_map = {
    "depressed":1,
    "mild_pain":2,
    "extreme_pain":4,
    "alert":3,
    "severe_pain":5,
    "slight":0
}

y.map({'lived':0, 'died':2, 'euthanized':1})

map_values(combined_data, 'pain', pain_map)
map_values(combined_data, 'age', {'adult':1,'young':0})
map_values(combined_data, 'capillary_refill_time', {'less_3_sec':0, 'more_3_sec':1, '3':2})
map_values(combined_data, 'peripheral_pulse', {'normal':0, 'increased':1, 'reduced':2, 'absent':3})
map_values(combined_data, 'mucous_membrane', {'normal_pink':0, 'pale_pink':1, 'pale_cyanotic':2, 'bright_pink':3, 'bright_red':4, 'dark_cyanotic':5})
map_values(combined_data, 'abdominal_distention', {'none':0, 'slight':1, 'moderate':2, 'severe':3})
map_values(combined_data, 'nasogastric_tube', {'slight':0, 'none':1, 'significant':2})
map_values(combined_data, 'nasogastric_reflux', {'none':0, 'more_1_liter':1, 'less_1_liter':2})
map_values(combined_data, 'rectal_exam_feces', {'absent':0, 'normal':1, 'decreased':2, 'increased':3})
map_values(combined_data, 'abdomen', {'distend_large':0, 'distend_small':1, 'normal':2, 'other':3})
map_values(combined_data, 'abdomo_appearance', {'cloudy':0, 'serosanguious':1, 'clear':2})
map_values(combined_data, 'peristalsis', {'absent':0, 'hypermotile':1, 'hypomotile':2, 'normal':3, 'distended_small':4})
map_values(combined_data, 'temp_of_extremities', {'cold':0, 'warm':1, 'normal':2, 'cool':3})

transform_y_n(combined_data, 'surgery')
transform_y_n(combined_data, 'cp_data')
transform_y_n(combined_data, 'surgical_lesion')

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,1,1,530001,38.1,132.0,24.0,3.0,2.0,5.0,1.0,...,1.0,57.0,8.5,1.0,3.4,1,2209,0,0,0
1,1,1,533836,37.5,88.0,12.0,3.0,0.0,2.0,1.0,...,1.0,33.0,64.0,1.0,2.0,1,2208,0,0,0
2,1,1,529812,38.3,120.0,28.0,3.0,2.0,1.0,0.0,...,0.0,37.0,6.4,1.0,3.4,1,5124,0,0,0
3,1,1,5262541,37.1,72.0,30.0,0.0,2.0,1.0,1.0,...,1.0,53.0,7.0,0.0,3.9,1,2208,0,0,1
4,0,1,5299629,38.0,52.0,48.0,2.0,0.0,0.0,0.0,...,2.0,47.0,7.3,0.0,2.6,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2350,0,1,529461,40.3,114.0,36.0,3.0,2.0,0.0,1.0,...,0.0,57.0,8.1,1.0,4.5,1,3205,0,0,1
2351,1,1,535338,37.2,100.0,20.0,3.0,2.0,2.0,1.0,...,1.0,50.0,66.0,1.0,2.0,1,2209,0,0,0
2352,1,1,529640,39.2,132.0,12.0,3.0,2.0,5.0,1.0,...,,53.0,7.6,1.0,4.5,1,2205,0,0,0
2353,0,1,5287179,38.3,54.0,66.0,2.0,0.0,0.0,0.0,...,,49.0,8.6,2.0,5.0,0,3111,0,0,1


In [46]:
train_data = combined_data.
missing_cols_train = train_data.columns[train_data.isnull().any()]
missing_cols_test = test_data.columns[test_data.isnull().any()]

missing_train_rows = pd.DataFrame(train_data[train_data.isnull().any(axis=1)])
missing_test_rows = pd.DataFrame(test_data[test_data.isnull().any(axis=1)])

missing_train_rows.drop(missing_cols_train, axis=1, inplace=True)

missing_train_rows

Unnamed: 0,surgery,age,hospital_number,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
2,yes,adult,529812,yes,5124,0,0,no
6,yes,adult,534787,no,3111,0,0,yes
9,yes,adult,529640,yes,3209,0,0,yes
14,yes,adult,528134,yes,2206,0,0,yes
15,no,adult,528305,no,31110,0,0,yes
...,...,...,...,...,...,...,...,...
294,yes,adult,533886,no,3205,0,0,no
295,no,adult,527702,yes,2208,0,0,yes
296,yes,adult,529386,yes,3205,0,0,no
297,yes,adult,530612,yes,2208,0,0,yes


In [47]:
num_cols = get_num_cols(train_data)
cat_cols = get_cat_cols(train_data)
unique_vals = {col: train_data[col].nunique() for col in cat_cols}

#print(f"{num_cols}'\n'{cat_cols}'\n'{unique_vals}")
print(cat_cols)

['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']


In [48]:
print(train_data.shape)
train_data.drop(missing_train_rows.index, axis=0, inplace=True)
train_data.reset_index(drop=True, inplace=True)
print(train_data.shape)
train_data.isnull().sum()

(1531, 27)
(586, 27)


surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [49]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def pred_missing_cols(df, cols, df_to_change, map, model):
    for col in cols:
        X = df.drop(col, axis=1)
        y = df[col]
        X.drop(cat_cols, inplace=True, axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        model.fit(X_train, y_train, verbose=False, early_stopping_rounds=5)
        y_pred = model.predict(X_test)
        print(f"Accuracy score for {col}: {accuracy_score(y_test, y_pred)}")
        print(f"Confusion matrix for {col}: \n{confusion_matrix(y_test, y_pred)}")
        print(f"Classification report for {col}: \n{classification_report(y_test, y_pred)}")
        print("\n\n")
        
        
    

model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=5, loss_function='MultiClass')

In [50]:
X = train_data.drop('pain', axis=1)
y = train_data['pain']
X.drop(cat_cols, inplace=True, axis=1)

hist = model.fit(X, y, verbose=False, early_stopping_rounds=5)

KeyError: "['pain'] not found in axis"

In [None]:
model.score(X, y)

In [None]:
pred_class = model.predict(missing_train_rows)
pred_prob = model.predict_proba(missing_train_rows)

In [None]:
pain_map_reverse = {v:k for k,v in pain_map.items()}
missing_train_rows['pain'] = pred_class
missing_train_rows['pain'] = missing_train_rows['pain'].map(pain_map_reverse)

In [None]:
missing_train_rows['pain']

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

def treat_missing_rows(train_df, test_df, missing_rows, missing_cols):
    num_cols = get_num_cols(missing_rows)
    cat_cols = get_cat_cols(missing_rows)
    
    ohe = OneHotEncoder()
    encoded_label = ohe.fit_transform(missing_rows[cat_cols])
    print(encoded_label)
    