In [46]:
import numpy as np 
import pandas as pd
import seaborn as sns


In [47]:
df = pd.read_csv('penguins_size.csv')
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [48]:
df.shape

(344, 7)

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [50]:
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [51]:
df.dropna(inplace=True)

In [52]:
df.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [53]:
df.sex.unique()

array(['MALE', 'FEMALE', '.'], dtype=object)

In [54]:
pd.get_dummies(df['sex']).astype(int).head()

Unnamed: 0,.,FEMALE,MALE
0,0,0,1
1,0,1,0
2,0,1,0
4,0,1,0
5,0,0,1


In [55]:
df['sex'].value_counts()

sex
MALE      168
FEMALE    165
.           1
Name: count, dtype: int64

In [56]:
df['sex'] = df['sex'].replace('.', 'MALE')

In [57]:
df['sex'].value_counts()

sex
MALE      169
FEMALE    165
Name: count, dtype: int64

In [58]:
sex = pd.get_dummies(df['sex'],drop_first=True).astype(int)
sex.head()

Unnamed: 0,MALE
0,1
1,0
2,0
4,0
5,1


In [59]:
df['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [60]:
island = pd.get_dummies(df['island'],drop_first=True)
island.head()

Unnamed: 0,Dream,Torgersen
0,False,True
1,False,True
2,False,True
4,False,True
5,False,True


In [61]:
new_df = pd.concat([df,island,sex],axis = 1)
new_df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,MALE
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,False,True,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,False,True,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,False,True,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,False,True,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,False,True,1


In [62]:
new_df.drop(['sex','island'],axis=1,inplace=True)
new_df.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,MALE
0,Adelie,39.1,18.7,181.0,3750.0,False,True,1
1,Adelie,39.5,17.4,186.0,3800.0,False,True,0
2,Adelie,40.3,18.0,195.0,3250.0,False,True,0
4,Adelie,36.7,19.3,193.0,3450.0,False,True,0
5,Adelie,39.3,20.6,190.0,3650.0,False,True,1


In [63]:
Y = new_df.species
Y.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [64]:
Y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [65]:
Y =Y.map({'Adelie':0,'Chinstrap':1,'Gentoo':2})
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

In [66]:
new_df.drop('species',inplace=True,axis=1)

In [67]:
new_df.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,MALE
0,39.1,18.7,181.0,3750.0,False,True,1
1,39.5,17.4,186.0,3800.0,False,True,0
2,40.3,18.0,195.0,3250.0,False,True,0
4,36.7,19.3,193.0,3450.0,False,True,0
5,39.3,20.6,190.0,3650.0,False,True,1


In [68]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(new_df,Y,test_size=0.3,random_state=0)


In [69]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=5,criterion='entropy',random_state=0)
clf.fit(X_train,Y_train)

In [71]:
y_pred = clf.predict(X_test)


In [77]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score


In [78]:
print(confusion_matrix(Y_test,y_pred))
print(accuracy_score(Y_test,y_pred))
print(classification_report(Y_test,y_pred))

[[50  0  0]
 [ 2 16  0]
 [ 0  0 33]]
0.9801980198019802
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        50
           1       1.00      0.89      0.94        18
           2       1.00      1.00      1.00        33

    accuracy                           0.98       101
   macro avg       0.99      0.96      0.97       101
weighted avg       0.98      0.98      0.98       101



## Trying with different no of trees and gini  criteria

In [82]:
clf = RandomForestClassifier(n_estimators=12,criterion='gini',random_state=0)
clf.fit(X_train,Y_train)

In [83]:
y_predict = clf.predict(X_test)
accuracy_score(Y_test,y_predict)

0.9801980198019802