In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("zoo.csv")
df.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,aardvark,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


Our objective is to classify the class of animals based on given attributes

Here, we can drop 'animal_name' feature as our objective is set i.e we will first identify the class and then through that class we will guess which animals come under this class.

In [3]:
df.drop('animal_name',axis=1,inplace=True)

In [4]:
df.head()

Unnamed: 0,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
1,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
2,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
3,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
4,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1


In [6]:
df['class_type'].value_counts()

1    41
2    20
4    13
7    10
6     8
3     5
5     4
Name: class_type, dtype: int64

In [8]:
df.shape

(101, 17)

so we have 101 records. Also the count for some classes are very less. So we will first handle imbalanced data

In [13]:
df['class_type'].value_counts()/len(df)

1    0.405941
2    0.198020
4    0.128713
7    0.099010
6    0.079208
3    0.049505
5    0.039604
Name: class_type, dtype: float64

In [16]:
# We will save this as weights for the classes and will pass it to the model
weights = df['class_type'].value_counts()/len(df)
dict(weights)

{1: 0.40594059405940597,
 2: 0.19801980198019803,
 4: 0.12871287128712872,
 7: 0.09900990099009901,
 6: 0.07920792079207921,
 3: 0.04950495049504951,
 5: 0.039603960396039604}

In [18]:
main_df = df.copy()

As there is less data, we will use cross-validation instead od train-test split

In [19]:
y = main_df.pop('class_type')
X = main_df

In [20]:
from sklearn.model_selection import cross_val_score

We will use RandomForestClassifier as it would work well with such kind of data

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
rf_model = RandomForestClassifier(class_weight=dict(weights))

In [29]:
score = cross_val_score(estimator=rf_model,
                X=X,
                y=y,
                scoring='accuracy',
                cv = 3,
                n_jobs=-1)

In [30]:
score.mean()

0.960190136660725

In [42]:
X.iloc[2,:]

hair        0
feathers    0
eggs        1
milk        0
airborne    0
aquatic     1
predator    1
toothed     1
backbone    1
breathes    0
venomous    0
fins        1
legs        0
tail        1
domestic    0
catsize     0
Name: 2, dtype: int64

In [39]:
rf_model.fit(X,y)

RandomForestClassifier(class_weight={1: 0.40594059405940597,
                                     2: 0.19801980198019803,
                                     3: 0.04950495049504951,
                                     4: 0.12871287128712872,
                                     5: 0.039603960396039604,
                                     6: 0.07920792079207921,
                                     7: 0.09900990099009901})

So we got about 96 percent accuracy..

In [31]:
#We can test our model now using examples

In [43]:
rf_model.predict([[0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0]])



array([4], dtype=int64)

Our model is predicting fine. We will save our model now..

In [44]:
import pickle

In [45]:
pickle.dump(rf_model, open('model.pkl','wb'))