In [1]:
#Importing libraries
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Load the penguins example dataset through seaborn's load_dataset helper
# and preview the first five rows.
data = sns.load_dataset('penguins')
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# (rows, columns) of the frame as loaded, before any cleaning
data.shape

(344, 7)

In [4]:
# Column dtypes and non-null counts; shows which columns contain missing values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Number of missing values in each column
data.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [6]:
# Remove every row that has at least one missing value. This modifies `data`
# in place and keeps the original index labels, so later displays show gaps
# (e.g. label 3 is missing).
data.dropna(inplace=True)

In [7]:
# Confirm that no missing values remain after dropping incomplete rows
data.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [8]:
# Distinct labels in the text column `sex` (inspected before it is encoded numerically)
data.sex.unique()

array(['Male', 'Female'], dtype=object)

In [9]:
# Number of rows per `sex` label
data.sex.value_counts()

Male      168
Female    165
Name: sex, dtype: int64

In [10]:
# Distinct labels in `species`, the column later used as the prediction target
data.species.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [11]:
# Number of rows per `species` label; shows how balanced the target classes are
data.species.value_counts()

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [12]:
# Distinct labels in the text column `island`
data.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [13]:
# Number of rows per `island` label
data.island.value_counts()

Biscoe       163
Dream        123
Torgersen     47
Name: island, dtype: int64

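#One-hot encoding with pd.get_dummies creates one indicator column per unique value and turns the labels into numeric columns (kept in the next two cells, commented out, for reference). The encoding actually applied in this notebook is integer category codes via .astype('category').cat.codes.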

In [14]:
#pd.get_dummies(data['sex']).head()

In [15]:
#sex=pd.get_dummies(data['sex'], drop_first=True)
#sex.head()

In [16]:
# Label-encode the three text columns: each is converted to a pandas
# categorical and replaced by its integer codes. With these string labels the
# codes follow alphabetical order (e.g. Female=0, Male=1).
for col in ('sex', 'species', 'island'):
    data[col] = data[col].astype('category').cat.codes

In [17]:
# Preview the frame after encoding: species, island and sex now hold integer codes
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
4,0,2,36.7,19.3,193.0,3450.0,0
5,0,2,39.3,20.6,190.0,3650.0,1


In [18]:
# Rows per `sex` code; compare with the label counts above to recover the code-to-label mapping
data.sex.value_counts()

1    168
0    165
Name: sex, dtype: int64

Encoding for `sex`: 1 = Male, 0 = Female

In [19]:
# Rows per `species` code; compare with the label counts above to recover the code-to-label mapping
data.species.value_counts()

0    146
2    119
1     68
Name: species, dtype: int64

Encoding for `species`:
- Adelie = 0
- Gentoo = 2
- Chinstrap = 1

In [20]:
# Rows per `island` code; compare with the label counts above to recover the code-to-label mapping
data.island.value_counts()

0    163
1    123
2     47
Name: island, dtype: int64

Encoding for `island`:
- Biscoe = 0
- Dream = 1
- Torgersen = 2

In [21]:
# Final look at the fully numeric frame before separating features and target
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
4,0,2,36.7,19.3,193.0,3450.0,0
5,0,2,39.3,20.6,190.0,3650.0,1


#Independent & dependent variables

In [22]:
# Features: every column except the target. Selecting by name instead of by
# position (iloc[:, 1:]) keeps `species` out of X even if the column order of
# `data` ever changes, so the target cannot leak into the features.
X = data.drop(columns=['species'])
# Target: the `species` column.
y = data['species']

In [23]:
# Display the feature matrix (pandas abbreviates the middle rows)
X

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,2,39.1,18.7,181.0,3750.0,1
1,2,39.5,17.4,186.0,3800.0,0
2,2,40.3,18.0,195.0,3250.0,0
4,2,36.7,19.3,193.0,3450.0,0
5,2,39.3,20.6,190.0,3650.0,1
...,...,...,...,...,...,...
338,0,47.2,13.7,214.0,4925.0,0
340,0,46.8,14.3,215.0,4850.0,0
341,0,50.4,15.7,222.0,5750.0,1
342,0,45.2,14.8,212.0,5200.0,0


In [24]:
# Display the target vector (pandas abbreviates the middle rows)
y

0      0
1      0
2      0
4      0
5      0
      ..
338    2
340    2
341    2
342    2
343    2
Name: species, Length: 333, dtype: int8

#Splitting the dataset using train_test_split

In [25]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [26]:
# Report the shape of each piece produced by the split.
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(label, part.shape)

X_train (233, 6)
X_test (100, 6)
y_train (233,)
y_test (100,)


#Applying Random Forest Classification on training set with criterion as Entropy

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Random forest of 5 trees that chooses splits by entropy; the fixed
# random_state makes the fitted forest reproducible.
num_trees = 5
model = RandomForestClassifier(
    n_estimators=num_trees,
    criterion='entropy',
    random_state=0,
)
model.fit(X_train, y_train)

In [29]:
#checking accuracy training data
y_pred_train = model.predict(X_train)
pd.Series(y_pred_train).value_counts()

0    97
2    83
1    53
dtype: int64

In [30]:
# Training accuracy of the entropy forest: the fraction of training rows whose
# predicted class equals the true class. Arguments follow the documented
# (y_true, y_pred) order.
from sklearn import metrics
from sklearn.metrics import accuracy_score
metrics.accuracy_score(y_train, y_pred_train)

0.9957081545064378

In [31]:
#checking accuracy testing data
y_pred_test = model.predict(X_test)
pd.Series(y_pred_test).value_counts()

0    50
2    36
1    14
dtype: int64

In [32]:
# Test accuracy of the entropy forest, arguments in (y_true, y_pred) order.
metrics.accuracy_score(y_test, y_pred_test)

0.98

In [33]:
# Per-class precision, recall, F1 and support for the entropy forest on the test split
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       1.00      0.88      0.93        16
           2       1.00      1.00      1.00        36

    accuracy                           0.98       100
   macro avg       0.99      0.96      0.97       100
weighted avg       0.98      0.98      0.98       100



In [34]:
# Confusion matrix for the entropy forest on the test split:
# rows are the true classes, columns are the predicted classes
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred_test)

array([[48,  0,  0],
       [ 2, 14,  0],
       [ 0,  0, 36]])

#Applying Random Forest Classification on training set with criterion as Gini

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Second random forest: 7 trees that choose splits by Gini impurity, fitted on
# the same training split with the same fixed random_state.
num_trees = 7
model1 = RandomForestClassifier(
    n_estimators=num_trees,
    criterion='gini',
    random_state=0,
)
model1.fit(X_train, y_train)

In [37]:
#checking accuracy training data
y_pred_train1 = model1.predict(X_train)
pd.Series(y_pred_train1).value_counts()

0    98
2    83
1    52
dtype: int64

In [38]:
# Training accuracy of the Gini forest: compare y_train with model1's own
# predictions (y_pred_train1). Passing y_pred_train here would re-score the
# entropy forest from the earlier section instead of this model.
# Re-run this cell to refresh the displayed value.
metrics.accuracy_score(y_train, y_pred_train1)

0.9957081545064378

In [39]:
#checking accuracy testing data
y_pred_test1 = model1.predict(X_test)
pd.Series(y_pred_test1).value_counts()

0    50
2    36
1    14
dtype: int64

In [40]:
# Test accuracy of the Gini forest, arguments in (y_true, y_pred) order.
metrics.accuracy_score(y_test, y_pred_test1)

0.98

In [41]:
# Per-class precision, recall, F1 and support for the Gini forest on the test split
print(classification_report(y_test,y_pred_test1))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       1.00      0.88      0.93        16
           2       1.00      1.00      1.00        36

    accuracy                           0.98       100
   macro avg       0.99      0.96      0.97       100
weighted avg       0.98      0.98      0.98       100



In [42]:
# Confusion matrix for the Gini forest on the test split:
# rows are the true classes, columns are the predicted classes
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred_test1)

array([[48,  0,  0],
       [ 2, 14,  0],
       [ 0,  0, 36]])