In [4]:
!pip install catboost



In [22]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import joblib

import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import f1_score, accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import catboost as cb

from sklearn.model_selection import GridSearchCV

train = pd.read_csv('/content/train.csv', index_col = "id")
test = pd.read_csv('/content/test.csv', index_col = "id")

In [6]:
train

Unnamed: 0_level_0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,more_3_sec,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,more_3_sec,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,less_3_sec,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,more_3_sec,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,less_3_sec,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1230,yes,adult,535246,38.5,129.0,48.0,cool,reduced,pale_pink,more_3_sec,...,57.0,66.0,serosanguious,2.0,yes,2206,0,0,no,lived
1231,yes,adult,528570,37.5,60.0,50.0,cool,reduced,pale_cyanotic,less_3_sec,...,35.0,6.4,serosanguious,3.6,yes,2209,0,0,yes,died
1232,yes,young,529685,37.5,84.0,40.0,normal,reduced,normal_pink,less_3_sec,...,40.0,5.9,cloudy,7.0,yes,400,0,0,yes,lived
1233,yes,adult,534784,38.1,70.0,16.0,normal,reduced,bright_red,less_3_sec,...,58.0,74.0,cloudy,2.0,yes,2209,0,0,no,lived


In [7]:
train.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
outcome                   object
dtype: object

In [8]:
label = LabelEncoder()

train_rows = train.shape[0]
merged_df = pd.concat([train, test])

tmp_ext = {"None": -1, "warm": 1, "normal": 0, "cool": 2, "cold": 3}
per_purse = {"None": -1, "absent": 1, "reduced": 2, "normal": 0, "increased": 3}
cap_ref = {"None": -1, "less_3_sec": 1, "3": 2, "more_3_sec": 3}
pn = {"depressed": 1, "mild_pain": 2, "severe_pain": 3, "extreme_pain": 4, "alert": 5}
prtls = {"None": -1, "absent": 1, "hypomotile": 2, "normal": 0, "hypermotile": 3}
abd_dis = {"None": -1, "none": 0, "slight": 1, "moderate": 2, "severe": 3}
nag_tube = {"None": -1, "none": 0, "slight": 1, "significant": 2}
nag_flux = {"None": -1, "none": 0, "slight": 1, "less_1_liter": 2, "more_1_liter": 3}
rec_ex = {"None": -1, "absent": 1, "decreased": 2, "normal": 0, "increased": 3}
abd = {"None": -1, "firm": 1, "distend_small": 2, "normal": 0, "distend_large": 3}
abd_app = {"None": -1, "clear": 0, "cloudy": 1, "serosanguious": 2}

for col in ["surgery", "age", "mucous_membrane", "surgical_lesion", "cp_data", "outcome"]:
    merged_df[col] = label.fit_transform(merged_df[col])

merged_df["temp_of_extremities"] = merged_df["temp_of_extremities"].map(tmp_ext)
merged_df["peripheral_pulse"] = merged_df["peripheral_pulse"].map(per_purse)
merged_df["capillary_refill_time"] = merged_df["capillary_refill_time"].map(cap_ref)
merged_df["pain"] = merged_df["pain"].map(pn)
merged_df["peristalsis"] = merged_df["peristalsis"].map(prtls)
merged_df["abdominal_distention"] = merged_df["abdominal_distention"].map(abd_dis)
merged_df["nasogastric_tube"] = merged_df["nasogastric_tube"].map(nag_tube)
merged_df["nasogastric_reflux"] = merged_df["nasogastric_reflux"].map(nag_flux)
merged_df["rectal_exam_feces"] = merged_df["rectal_exam_feces"].map(rec_ex)
merged_df["abdomen"] = merged_df["abdomen"].map(abd)
merged_df["abdomo_appearance"] = merged_df["abdomo_appearance"].map(abd_app)

train = merged_df.iloc[:train_rows]
test = merged_df.iloc[train_rows:].drop(columns = 'outcome')

In [9]:
for df in [train, test]:
    df.drop(columns = ["hospital_number"], inplace=True)
    df.fillna(method = "ffill", inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = ["hospital_number"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.fillna(method = "ffill", inplace = True)


In [10]:
df

Unnamed: 0_level_0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1235,0,0,38.6,40.0,20.0,0,0,4,1,2.0,...,2.0,42.0,7.5,0,2.3,0,0,0,0,0
1236,1,0,38.2,112.0,48.0,2,2,1,3,1.0,...,2.0,44.0,6.0,2,2.6,0,2208,0,0,1
1237,1,0,37.7,66.0,12.0,2,0,2,1,2.0,...,2.0,31.5,6.0,1,1.6,1,2205,0,0,1
1238,0,0,37.1,88.0,20.0,2,2,5,1,1.0,...,3.0,75.0,81.0,-1,1.0,1,1400,0,0,0
1239,1,0,38.3,50.0,12.0,-1,0,1,1,2.0,...,2.0,37.0,6.8,1,2.6,1,2208,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2054,0,0,40.3,114.0,36.0,2,2,4,3,1.0,...,3.0,57.0,8.1,2,4.5,1,3205,0,0,1
2055,1,0,37.2,100.0,20.0,2,2,5,3,4.0,...,2.0,50.0,66.0,2,2.0,1,2209,0,0,0
2056,1,0,39.2,132.0,12.0,2,2,3,3,1.0,...,-1.0,53.0,7.6,2,4.5,1,2205,0,0,0
2057,0,0,38.3,54.0,66.0,0,0,4,1,2.0,...,-1.0,49.0,8.6,0,5.0,0,3111,0,0,1


In [11]:
missing_perc_df = pd.DataFrame(index = ['train_dataframe', 'test_dataframe'], columns = test.columns)

for i, col in enumerate(test.columns):
    missing_perc_df.loc[missing_perc_df.index == 'train_dataframe', col] = np.round(train.isna().sum()[i] / train[col].shape * 100, 3)
    missing_perc_df.loc[missing_perc_df.index == 'test_dataframe', col] = np.round(test.isna().sum()[i] / test[col].shape * 100, 3)

for col in test.columns:
    missing_perc_df[col] = missing_perc_df[col].astype(str) + '%'

missing_perc_df

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
train_dataframe,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,...,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%
test_dataframe,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,...,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%,0.0%


In [12]:
X = train.iloc[:, :-1]
y = train.iloc[:, -1]

N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

In [13]:
models = {
    'logistic_regressor': LogisticRegression(random_state=42, max_iter = 1000),
    'randomforest_classifier': RandomForestClassifier(random_state=42, verbose = 0),
    'xgb_classifier': xgb.XGBClassifier(random_state=42, verbosity = 0),
    'catboost_classifier': cb.CatBoostClassifier(random_state = 42, logging_level="Silent")
}

for model_name, model in models.items():
    f1_scores = cross_val_score(model, X, y, cv=kf, scoring='f1_micro')
    avg_f1_score = f1_scores.mean()
    print(f'{model_name}\'s average F1 score across {N_SPLITS}-Fold CV is {avg_f1_score * 100:.3f}%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

logistic_regressor's average F1 score across 5-Fold CV is 63.887%
randomforest_classifier's average F1 score across 5-Fold CV is 69.474%
xgb_classifier's average F1 score across 5-Fold CV is 70.364%
catboost_classifier's average F1 score across 5-Fold CV is 69.879%


In [33]:
model = xgb.XGBClassifier(n_estimators = 100,max_depth = 3, learning_rate = 0.5)
model.fit(X, y)

scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
mean_accuracy = scores.mean()
print(f"accuracy: {mean_accuracy * 100:.3f}%")

accuracy: 69.069%


In [32]:
xgb_model = xgb.XGBClassifier(random_state=42, verbosity=0)

param = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'min_child_weight': [1, 2, 3]
}

grid_search = GridSearchCV(xgb_model, param, cv=kf, scoring='f1_micro')

grid_search.fit(X, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best Hyperparameters: {best_params}')
print(f'Best F1 Score: {best_score * 100:.3f}%')

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 200}
Best F1 Score: 71.903%


In [55]:
model = xgb.XGBClassifier(random_state = 42, verbosity = 0, n_estimators = 100,
                          min_child_weight = 2, max_depth = 3, learning_rate = 0.1)
model.fit(X, y)

In [56]:
status = {0: 'died', 1: 'euthanized', 2: 'lived'}
submission = pd.DataFrame({'id': test.index, 'outcome': model.predict(test).astype(int)
})
submission['outcome'] = submission['outcome'].map(status)
submission.to_csv('submission.csv', index=False)

In [57]:
!pip install -q streamlit

In [58]:
!npm install -g localtunnel

[K[?25h/tools/node/bin/lt -> /tools/node/lib/node_modules/localtunnel/bin/lt.js
+ localtunnel@2.0.2
updated 1 package in 0.658s


In [59]:
import pickle

with open('model.pkl', 'wb') as file:
    pickle.dump(model, file)

In [60]:
%%writefile app.py

import streamlit as st
import pickle
import pandas as pd

def user_input_features():
  surgery = st.sidebar.slider('surgery', 0, 1)
  age = st.sidebar.slider('age', 0, 1)
  rectal_temp = st.sidebar.slider('rectal_temp', 35.4, 40.8, 0.1)
  pulse = st.sidebar.slider('pulse', 30.0, 184.0, 1.0)
  respiratory_rate = st.sidebar.slider('respiratory_rate', 8.0, 96.0, 1.0)
  temp_of_extremities = st.sidebar.slider('temp_of_extremities', -1, 3, 1)
  peripheral_pulse = st.sidebar.slider('peripheral_pulse', -1, 3, 1)
  mucous_membrane = st.sidebar.slider('mucous_membrane', 0, 6, 1)
  capillary_refill_time = st.sidebar.slider('capillary_refill_time', -1, 3, 1)
  pain = st.sidebar.slider('pain', -1.0, 5.0, 1.0)
  peristalsis = st.sidebar.slider('peristalsis', -1.0, 3.0, 1.0)
  abdominal_distention = st.sidebar.slider('abdominal_distention', -1, 3, 1)
  nasogastric_tube = st.sidebar.slider('nasogastric_tube', -1, 2, 1)
  nasogastric_reflux = st.sidebar.slider('nasogastric_reflux', -1, 3, 1)
  nasogastric_reflux_ph = st.sidebar.slider('nasogastric_reflux_ph', 1.0, 7.5, 0.1)
  rectal_exam_feces = st.sidebar.slider('rectal_exam_feces', -1.0, 3.0, 1.0)
  abdomen = st.sidebar.slider('abdomen', -1.0, 3.0, 1.0)
  packed_cell_volume = st.sidebar.slider('packed_cell_volume', 23.0, 75.0, 0.1)
  total_protein = st.sidebar.slider('total_protein', 3.9, 89.0, 0.1)
  abdomo_appearance = st.sidebar.slider('abdomo_appearance', -1, 2, 1)
  abdomo_protein = st.sidebar.slider('abdomo_protein', 0.1, 10.1, 0.1)
  surgical_lesion = st.sidebar.slider('surgical_lesion', 0, 1)
  lesion_1 = st.sidebar.slider('lesion_1', 0, 31110, 1)
  lesion_2 = st.sidebar.slider('lesion_2', 0, 4300, 1)
  lesion_3 = 0
  cp_data = st.sidebar.slider('cp_data', 0, 1)


  user_input_data = {'surgery': surgery,
               'age': age,
               'rectal_temp': rectal_temp,
               'pulse': pulse,
               'respiratory_rate': respiratory_rate,
               'temp_of_extremities': temp_of_extremities,
               'peripheral_pulse': peripheral_pulse,
               'mucous_membrane': mucous_membrane,
               'capillary_refill_time': capillary_refill_time,
               'pain': pain,
               'peristalsis': peristalsis,
               'abdominal_distention': abdominal_distention,
               'nasogastric_tube': nasogastric_tube,
               'nasogastric_reflux': nasogastric_reflux,
               'nasogastric_reflux_ph': nasogastric_reflux_ph,
               'rectal_exam_feces': rectal_exam_feces,
               'abdomen': abdomen,
               'packed_cell_volume': packed_cell_volume,
               'total_protein': total_protein,
               'abdomo_appearance': abdomo_appearance,
               'abdomo_protein': abdomo_protein,
               'surgical_lesion': surgical_lesion,
               'lesion_1': lesion_1,
               'lesion_2': lesion_2,
               'lesion_3': lesion_3,
               'cp_data': cp_data}


  features = pd.DataFrame(user_input_data, index=['0'])     ## create dataframe for user's inputs'
  return features

with open('model.pkl', 'rb') as file:                       ## load pickle model
   model = pickle.load(file)

labels = ['died', 'euthanized' , 'lived']

st.write('''Predict Health Outcomes of Horses''')
st.sidebar.header('User Input Parameters')

df = user_input_features()                   ##  read input from user

st.subheader('User Input Parameters')
st.write(df)

prediction = model.predict(df)
prediction_probabilities = model.predict_proba(df)

st.subheader('Prediction')
st.write(labels[prediction[0]])

st.subheader('Class labels and their corresponding index number')
st.write(labels)

st.subheader('Prediction Probability')
st.write(prediction_probabilities)

Overwriting app.py


In [61]:
!streamlit run /content/app.py & npx localtunnel --port 8501

[######............] \ diffTrees: sill install generateActionsToTake[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.233.210.219:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.545s
your url is: https://salty-towns-film.loca.lt
[34m  Stopping...[0m
^C
