# Decision Tree Vs Random Forest on Horse Survival data

In [1]:
### Loading the required libraries...

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn

In [2]:
### loading the dataset...
df= pd.read_csv('horse.csv')
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [3]:
### checking the dimensions of the dataset...
df.shape

(299, 28)

In [4]:
#### getting more info on the variables....
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   surgery                299 non-null    object 
 1   age                    299 non-null    object 
 2   hospital_number        299 non-null    int64  
 3   rectal_temp            239 non-null    float64
 4   pulse                  275 non-null    float64
 5   respiratory_rate       241 non-null    float64
 6   temp_of_extremities    243 non-null    object 
 7   peripheral_pulse       230 non-null    object 
 8   mucous_membrane        252 non-null    object 
 9   capillary_refill_time  267 non-null    object 
 10  pain                   244 non-null    object 
 11  peristalsis            255 non-null    object 
 12  abdominal_distention   243 non-null    object 
 13  nasogastric_tube       195 non-null    object 
 14  nasogastric_reflux     193 non-null    object 
 15  nasoga

In [5]:
#### Getting a statistical summary of the dataset...
df.describe()

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,lesion_3
count,299.0,239.0,275.0,241.0,53.0,270.0,266.0,101.0,299.0,299.0,299.0
mean,1087733.0,38.168619,72.0,30.460581,4.707547,46.307407,24.274436,3.039604,3659.70903,90.528428,7.38796
std,1532032.0,0.733744,28.646219,17.666102,1.982311,10.436743,27.364194,1.967947,5408.472421,650.637139,127.749768
min,518476.0,35.4,30.0,8.0,1.0,23.0,3.3,0.1,0.0,0.0,0.0
25%,528904.0,37.8,48.0,18.0,3.0,38.0,6.5,2.0,2111.5,0.0,0.0
50%,530301.0,38.2,64.0,25.0,5.0,45.0,7.5,2.3,2322.0,0.0,0.0
75%,534736.0,38.5,88.0,36.0,6.5,52.0,56.75,3.9,3209.0,0.0,0.0
max,5305629.0,40.8,184.0,96.0,7.5,75.0,89.0,10.1,41110.0,7111.0,2209.0


In [6]:
### listing the columns (variables)
df.columns

Index(['surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'outcome', 'surgical_lesion', 'lesion_1', 'lesion_2',
       'lesion_3', 'cp_data'],
      dtype='object')

In [7]:
### Extracting the target variable ('outcome') from the dataset...
target = df['outcome']
target.unique()

array(['died', 'euthanized', 'lived'], dtype=object)

In [8]:
### Dropping target variable from the dataset...
df.drop(['outcome'],axis = 1, inplace = True)
df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,distend_large,45.0,8.4,,,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,other,50.0,85.0,cloudy,2.0,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,normal,33.0,6.7,,,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,,48.0,7.2,serosanguious,5.3,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,74.0,7.4,,,no,4300,0,0,no


In [9]:
### checking for null values....
df.isnull().sum()
Null = df.isnull().sum()
print(Null[Null>0])

rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
dtype: int64


In [10]:
#### Converting categorical variables to dummy variables

cat_var = ['surgery','age','temp_of_extremities','peripheral_pulse','mucous_membrane','capillary_refill_time','pain',
             'peristalsis','nasogastric_tube','nasogastric_reflux','rectal_exam_feces','abdominal_distention','abdomen',
             'abdomo_appearance','surgical_lesion','cp_data']

for category in cat_var:
    df[category] = pd.get_dummies(df[category])

In [11]:
### Replacing null values with mode for categorivcal variables...
for i in df[category]:
    df[category].replace(np.nan, df[category].mode()[0], inplace = True)

In [12]:
con_var = ['hospital_number', 'rectal_temp','pulse','respiratory_rate','nasogastric_reflux_ph','packed_cell_volume',
           'total_protein','abdomo_protein','lesion_1','lesion_2','lesion_3']

In [13]:
### replacing null values withmedian for continous variables
for i in con_var:
    df[i].replace(np.nan,df[i].median(), inplace = True)

In [16]:
df.isnull().sum()

surgery                  0
age                      0
hospital_number          0
rectal_temp              0
pulse                    0
respiratory_rate         0
temp_of_extremities      0
peripheral_pulse         0
mucous_membrane          0
capillary_refill_time    0
pain                     0
peristalsis              0
abdominal_distention     0
nasogastric_tube         0
nasogastric_reflux       0
nasogastric_reflux_ph    0
rectal_exam_feces        0
abdomen                  0
packed_cell_volume       0
total_protein            0
abdomo_appearance        0
abdomo_protein           0
surgical_lesion          0
lesion_1                 0
lesion_2                 0
lesion_3                 0
cp_data                  0
dtype: int64

In [17]:
### Importing train_test_split  and LabelEncoder from sklearn ..... 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [18]:
### Creating x(predictor ) nad y(target) variables...
x = df.values
y = target.values

In [19]:
lab = LabelEncoder()
y = lab.fit_transform(y)

In [20]:
#### splitting the dataset .....
xtrain,xtest,ytrain,ytest  = train_test_split(x,y,test_size = 0.2, random_state = 1)
xtrain.shape

(239, 27)

# Decision Tree Classifier

In [21]:
### Importing  DecisionTreeClassifier from sklearn..
from sklearn.tree import DecisionTreeClassifier

In [22]:
### creating a classifier instance...
classifier = DecisionTreeClassifier()

In [23]:
### fitting the data onto the classifier...
classifier.fit(xtrain,ytrain)

DecisionTreeClassifier()

In [24]:
#### prediction
ypred = classifier.predict(xtest)

In [25]:
from sklearn.metrics import accuracy_score

In [26]:
Accuracy = accuracy_score(ypred,ytest)

In [27]:
### model accuracy
print(Accuracy)

0.6666666666666666


# Random Forest Classifier

In [28]:
### Importing random forest
from sklearn.ensemble import RandomForestClassifier 


In [29]:
### creating a classifier instance...
random = RandomForestClassifier()

In [30]:
### fitting the data onto the classifier...
random.fit(xtrain,ytrain)


RandomForestClassifier()

In [32]:
#### prediction
ypred = random.predict(xtest)

In [33]:
Accuracy = accuracy_score(ypred,ytest)

In [34]:
### model accuracy
print(Accuracy)

0.7166666666666667


In [None]:
# Hence "RandomForestClassifier" has  a better prediction accuracy as compared to "DecisionTreeClassifier"