# Predict Health Outcomes of Horses
#### Playground Series - Season 3, Episode 22

### Description
Synthetically-Generated Datasets
Using synthetic data for Playground competitions allows us to strike a balance between having real-world data (with named features) and ensuring test labels are not publicly available. This allows us to host competitions with more interesting datasets than in the past. While there are still challenges with synthetic data generation, the state-of-the-art is much better now than when we started the Tabular Playground Series two years ago, and the goal is to produce datasets that have far fewer artifacts. Please feel free to give us feedback on the datasets for the different competitions so that we can continue to improve!

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the labelled training data.
# Several categorical columns use the literal text 'None' as a category label
# (later cells remap it with .replace). pandas >= 2.0 parses 'None' as a
# missing value by default, so NA parsing is restricted to empty fields to
# keep that label as text on every pandas version.
trainData = pd.read_csv('train.csv', keep_default_na=False, na_values=[''])
trainData.head(5)


In [None]:
# Load the unlabelled test data.
# Several categorical columns use the literal text 'None' as a category label
# (later cells remap it with .replace). pandas >= 2.0 parses 'None' as a
# missing value by default, so NA parsing is restricted to empty fields to
# keep that label as text on every pandas version.
testData = pd.read_csv('test.csv', keep_default_na=False, na_values=[''])
testData.head(5)

In [None]:
# Display the (rows, columns) shape of the train and test frames.
trainData.shape,testData.shape

In [None]:
# Print the column names, dtypes and non-null counts of the training frame.
trainData.info()


## Step 2 : EDA and Data Visualization

### ON TRAIN DATASET

In [None]:
# Count the missing values in each column of the training frame.
trainData.isnull().sum()

### The data is clean: there are no missing values.

In [None]:
# Names of the categorical columns, including the target column 'outcome'.
categoricallist = [
    'surgery',
    'age',
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
    'surgical_lesion',
    'cp_data',
    'outcome',
]

### Univariate Analysis:

In [None]:
# # univariate Analysis on Categorical variable
# for i in categoricallist:
#     plt.figure(figsize=(8,2))
#     sns.countplot(x=trainData[i],data = trainData)
#     plt.show()

In [None]:
# Summary statistics for the numeric columns of the training frame.
trainData.describe()

In [None]:
# Univariate Analysis on Numerical variable

#### We can see that lesion_1, lesion_2 and lesion_3 have outliers, so we cap them at the 5th and 95th percentiles.

In [None]:
# Columns whose extreme values are capped in the next cell.
list_outlier = [
    'total_protein',
    'lesion_1',
    'lesion_2',
    'lesion_3',
]
# Remaining numeric measurement columns.
numerical_list = [
    'rectal_temp',
    'pulse',
    'respiratory_rate',
    'nasogastric_reflux_ph',
    'packed_cell_volume',
    'abdomo_protein',
]

In [None]:
# Cap each outlier-prone column at its own 5th and 95th percentiles: values
# below the 5th percentile are raised to it, values above the 95th are
# lowered to it.
#
# The capped column is assigned back explicitly. The chained form
# `trainData[i][mask] = value` writes through a temporary Series, which
# pandas flags with SettingWithCopyWarning and which leaves trainData
# unchanged when copy-on-write is enabled.
for i in list_outlier:
    lower_cap, upper_cap = trainData[i].quantile([0.05, 0.95]).values
    trainData[i] = trainData[i].clip(lower=lower_cap, upper=upper_cap)

In [None]:
# for i in list_outlier:
#     plt.figure(figsize=(3,3))
#     sns.boxplot(trainData[i])
#     plt.title(trainData[i].name)
#     plt.show()

In [None]:
# for i in numerical_list:
#     plt.figure(figsize=(2,2))
#     sns.boxplot(trainData[i])
#     plt.title(trainData[i].name)
#     plt.show()

### BIVARIATE Analysis

In [None]:
# # categorical variable list with respect to target column Outcome
# for i in categoricallist:
#     sns.countplot(x=trainData[i],data=trainData,hue=trainData['outcome'])
#     plt.show()

In [None]:
# # Numerical variable list with respect to target column Outcome
# for i in numerical_list:
#     plt.figure(figsize=(17,3))
#     sns.countplot(x=trainData[i],data=trainData,hue=trainData['outcome'])
#     plt.xticks(rotation=30)
#     plt.show()

### Multivariate Analysis:

In [None]:
# plt.figure(figsize=(10,5))
# sns.heatmap(trainData.corr(),annot=True)
# plt.show()

In [None]:
# plt.figure(figsize=(20,5))
# sns.pairplot(trainData)
# plt.show()

In [None]:
# Insights:

# step3 : Data Preparation

## For TrainSet Data

In [None]:
# Drop the columns judged to have no impact on the model, identically for
# the train and test frames.
unused_columns = ['hospital_number','lesion_2','lesion_3']
trainData = trainData.drop(columns=unused_columns)
testData = testData.drop(columns=unused_columns)

In [None]:
# Display the shapes again after dropping the unused columns.
trainData.shape,testData.shape

### Creating dummy variables, dropping one reference category from each, and adding them to the dataframe.

In [None]:
# Columns holding 'yes'/'no' values that are mapped to 1/0 in the next cell.
binary_list_var = ['surgery','surgical_lesion','cp_data']

In [None]:
def change_to_binary(x):
    """Return a copy of Series *x* with 'yes' mapped to 1 and 'no' mapped to 0.

    Any value other than 'yes' or 'no' becomes NaN, as with any dict-based
    ``Series.map``.
    """
    yes_no_codes = {'yes':1,'no':0}
    return x.map(yes_no_codes)
# Encode every yes/no column as 1/0, one column at a time, in both frames.
for frame in (trainData, testData):
    for flag_column in binary_list_var:
        frame[flag_column] = change_to_binary(frame[flag_column])


In [None]:
# Encode age as a binary flag: adult -> 1, young -> 0
# (any other label would become NaN).
age_codes = {'adult':1,'young':0}
trainData['age'] = trainData['age'].map(age_codes)
testData['age'] = testData['age'].map(age_codes)

#### Replacing in TrainData Set

In [None]:
# Relabel the categorical values of trainData before dummy encoding:
#   * 'None' becomes an explicit "nothing recorded" label for that column;
#   * some labels are merged (e.g. 'cold' into the 'cool' label);
#   * generic labels such as 'normal' or 'slight', which occur in several
#     columns, get the column name appended so the resulting dummy column
#     names stay unambiguous.
# NOTE(review): peristalsis maps 'hypomotile' onto 'hypermotile', which merges
# two different labels into one - confirm this is intended and not a typo.
train_label_fixes = {
    'temp_of_extremities': {'cool':'cool_temp_of_extremities','cold':'cool_temp_of_extremities','None':'no_issues','normal':'normal_temp_of_extremities','warm':'warm_temp_of_extremities'},
    'peripheral_pulse': {'None':'absent','reduced':'reduced_peripheral_pulse','normal':'normal_peripheral_pulse','increased':'increased_peripheral_pulse'},
    'mucous_membrane': {'None':'no_color'},
    'capillary_refill_time': {'3':'None'},
    'pain': {'None':'no_pain'},
    'peristalsis': {'None':'absent','hypomotile':'hypermotile','normal':'normal_peristalsis'},
    'abdominal_distention': {'None':'nothing','none':'nothing','moderate':'moderate_abdominal_distention','slight':'slight_abdominal_distention','severe':'severe_abdominal_distention'},
    'nasogastric_tube': {'None':'no_issues','none':'no_issues','slight':'slight_nasogastric_tube','significant':'significant_nasogastric_tube'},
    'nasogastric_reflux': {'None':'no_issues','none':'no_issues','slight':'slight_nasogastric_reflux'},
    'rectal_exam_feces': {'None':'no_traces_rectal_exam_feces','decreased':'decreased_rectal_exam_feces','normal':'normal_rectal_exam_feces','increased':'increased_rectal_exam_feces'},
    'abdomen': {'None':'no_issues','other':'no_issues','distend_small':'distend_small_abdomen','distend_large':'distend_large_abdomen','normal':'normal_abdomen','firm':'firm_abdomen'},
    'abdomo_appearance': {'None':'no_issues','serosanguious':'serosanguious_abdomo_appearance','cloudy':'cloudy_abdomo_appearance','clear':'clear_abdomo_appearance'},
}

for column_name, relabel in train_label_fixes.items():
    trainData[column_name] = trainData[column_name].replace(relabel)

#### Replacing in TestData Set

In [None]:
# Relabel the categorical values of testData before dummy encoding:
#   * 'None' becomes an explicit "nothing recorded" label for that column;
#   * some labels are merged (e.g. 'cold' into the 'cool' label);
#   * generic labels such as 'normal' or 'slight', which occur in several
#     columns, get the column name appended so the resulting dummy column
#     names stay unambiguous.
# NOTE(review): peristalsis maps 'hypomotile' onto 'hypermotile', which merges
# two different labels into one - confirm this is intended and not a typo.
test_label_fixes = {
    'temp_of_extremities': {'cool':'cool_temp_of_extremities','cold':'cool_temp_of_extremities','None':'no_issues','normal':'normal_temp_of_extremities','warm':'warm_temp_of_extremities'},
    'peripheral_pulse': {'None':'absent','reduced':'reduced_peripheral_pulse','normal':'normal_peripheral_pulse','increased':'increased_peripheral_pulse'},
    'mucous_membrane': {'None':'no_color'},
    'capillary_refill_time': {'3':'None'},
    'pain': {'None':'no_pain'},
    'peristalsis': {'None':'absent','hypomotile':'hypermotile','normal':'normal_peristalsis'},
    'abdominal_distention': {'None':'nothing','none':'nothing','moderate':'moderate_abdominal_distention','slight':'slight_abdominal_distention','severe':'severe_abdominal_distention'},
    'nasogastric_tube': {'None':'no_issues','none':'no_issues','slight':'slight_nasogastric_tube','significant':'significant_nasogastric_tube'},
    'nasogastric_reflux': {'None':'no_issues','none':'no_issues','slight':'slight_nasogastric_reflux'},
    'rectal_exam_feces': {'None':'no_traces_rectal_exam_feces','decreased':'decreased_rectal_exam_feces','normal':'normal_rectal_exam_feces','increased':'increased_rectal_exam_feces'},
    'abdomen': {'None':'no_issues','other':'no_issues','distend_small':'distend_small_abdomen','distend_large':'distend_large_abdomen','normal':'normal_abdomen','firm':'firm_abdomen'},
    'abdomo_appearance': {'None':'no_issues','serosanguious':'serosanguious_abdomo_appearance','cloudy':'cloudy_abdomo_appearance','clear':'clear_abdomo_appearance'},
}

for column_name, relabel in test_label_fixes.items():
    testData[column_name] = testData[column_name].replace(relabel)

#### creating dummy for TrainData Set

In [None]:
# One-hot encode the categorical columns of trainData. Each source column is
# removed and its indicator columns are appended at the end, in the order
# listed here.
#
# Every entry is (column, baseline). The indicator for `baseline` is dropped
# so that label acts as the reference level. A baseline of None means "drop
# whichever category sorts first" (get_dummies' drop_first=True), which is
# how rectal_exam_feces is handled.
#
# Two defects of the copy-pasted version are avoided here:
#   * `df = df.drop(..., inplace=True)` rebinds df to None, and pd.concat
#     silently skips None, so the indicators for nasogastric_tube,
#     nasogastric_reflux, abdomen and abdomo_appearance were never added;
#   * `drop(label, 1)` passes axis positionally, which pandas >= 2.0 rejects.
train_dummy_spec = [
    ('temp_of_extremities', 'no_issues'),
    ('peripheral_pulse', 'absent'),
    ('mucous_membrane', 'no_color'),
    ('capillary_refill_time', 'None'),
    ('pain', 'no_pain'),
    ('peristalsis', 'absent'),
    ('abdominal_distention', 'nothing'),
    ('nasogastric_tube', 'no_issues'),
    ('nasogastric_reflux', 'no_issues'),
    ('rectal_exam_feces', None),
    ('abdomen', 'no_issues'),
    ('abdomo_appearance', 'no_issues'),
]

for column, baseline in train_dummy_spec:
    if baseline is None:
        dummies = pd.get_dummies(trainData[column], drop_first=True)
    else:
        # errors='ignore': if the baseline label never occurs in this column
        # (for example because it was read as a missing value), get_dummies
        # emits no indicator for it and there is nothing to drop.
        dummies = pd.get_dummies(trainData[column]).drop(columns=baseline, errors='ignore')
    trainData = pd.concat([trainData.drop(columns=column), dummies], axis=1)

trainData.head(5)

#### creating dummy for testdata set

In [None]:
# One-hot encode the categorical columns of testData. Each source column is
# removed and its indicator columns are appended at the end, in the order
# listed here.
#
# Every entry is (column, baseline). The indicator for `baseline` is dropped
# so that label acts as the reference level. A baseline of None means "drop
# whichever category sorts first" (get_dummies' drop_first=True), which is
# how rectal_exam_feces is handled.
#
# Two defects of the copy-pasted version are avoided here:
#   * `df = df.drop(..., inplace=True)` rebinds df to None, and pd.concat
#     silently skips None, so the indicators for nasogastric_tube,
#     nasogastric_reflux, abdomen and abdomo_appearance were never added;
#   * `drop(label, 1)` passes axis positionally, which pandas >= 2.0 rejects.
test_dummy_spec = [
    ('temp_of_extremities', 'no_issues'),
    ('peripheral_pulse', 'absent'),
    ('mucous_membrane', 'no_color'),
    ('capillary_refill_time', 'None'),
    ('pain', 'no_pain'),
    ('peristalsis', 'absent'),
    ('abdominal_distention', 'nothing'),
    ('nasogastric_tube', 'no_issues'),
    ('nasogastric_reflux', 'no_issues'),
    ('rectal_exam_feces', None),
    ('abdomen', 'no_issues'),
    ('abdomo_appearance', 'no_issues'),
]

for column, baseline in test_dummy_spec:
    if baseline is None:
        dummies = pd.get_dummies(testData[column], drop_first=True)
    else:
        # errors='ignore': if the baseline label never occurs in this column
        # (for example because it was read as a missing value), get_dummies
        # emits no indicator for it and there is nothing to drop.
        dummies = pd.get_dummies(testData[column]).drop(columns=baseline, errors='ignore')
    testData = pd.concat([testData.drop(columns=column), dummies], axis=1)

testData.head(5)

In [None]:
# Display the shapes of both frames after dummy encoding.
trainData.shape , testData.shape

# Step 4: Creating X and y on the Training Data Set and Scaling All Features

In [None]:
# Alias the encoded training frame (same object, not a copy); X_train is
# rebound to the training split further down.
X_train = trainData

In [None]:
# Feature matrix: every column except the target and three indicator columns.
# NOTE(review): 'distend_small', 'slight' and 'serosanguious' are presumably
# dummy columns that exist only in the training data - confirm.
columns_to_exclude = ['outcome','distend_small','slight','serosanguious']
X = X_train.drop(columns=columns_to_exclude)

# Target vector.
y = trainData['outcome']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Display the training-split feature matrix.
X_train

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column using statistics learned from the
# training split. Only X_train is transformed in this cell; X_test is left
# unscaled.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)


# Step 7: Model Building
#### Let's start by splitting our data into a training set and a test set.

#### Running Your First Training Model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
import warnings

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression preceded by feature standardisation.
# The notebook computes a scaled copy of the training features but fits and
# evaluates the model on the unscaled frames, so the scaling never reaches
# the model. Bundling the scaler with the estimator applies the same scaling
# on every fit/predict/score call, and lr keeps the same methods.
lr = make_pipeline(StandardScaler(), LogisticRegression())

In [None]:
# Fit the classifier on the training split.
lr.fit(X_train,y_train)

In [None]:
# Display the predicted outcomes for the hold-out split (result not stored).
lr.predict(X_test)

In [None]:
# Display the hold-out feature matrix.
X_test

In [None]:
# Mean accuracy of the classifier on the hold-out split.
lr.score(X_test,y_test)

In [None]:
# Keep the hold-out predictions for the evaluation cells, then display them.
pred_outcome = lr.predict(X_test)
pred_outcome

In [None]:
# confusion_matrix is not among the names imported earlier in the notebook,
# so import it here to avoid a NameError.
from sklearn.metrics import confusion_matrix

# Rows are the true outcomes, columns the predicted outcomes.
cm = confusion_matrix(y_test,pred_outcome)
cm

In [None]:
# Accuracy on the hold-out split. The `metrics` module is never imported in
# this notebook, but `accuracy_score` itself is, so call it directly.
accuracy_score(y_test,pred_outcome)

In [None]:
# Alias the encoded test frame (same object, not a copy).
test_data = testData

In [None]:
# Remove the 'moderate' indicator column from the test features.
# `columns=` is used because pandas >= 2.0 no longer accepts the axis as a
# positional argument (`drop('moderate', 1)` raises TypeError there).
test_data = test_data.drop(columns='moderate')

In [None]:
# The model was fit on X_train, so the test frame must present exactly the
# same columns in the same order. Columns the model never saw are discarded;
# indicator columns absent from the test data are added as all-zero.
test_features = test_data.reindex(columns=X_train.columns, fill_value=0)
f = lr.predict(test_features)

In [None]:
# Accuracy cannot be computed for these predictions: the test file carries no
# 'outcome' labels to compare against, and accuracy_score needs both the true
# and the predicted labels. Show how many rows were assigned to each
# predicted class instead.
pd.Series(f, name='outcome').value_counts()