# Importing Necessary Libraries

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [17]:
# Loading datset

penguins = pd.read_csv('/content/penguins_data.csv').dropna()
penguins.head()

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
0,39.1,18.7,181.0,3750.0,0
1,39.5,17.4,186.0,3800.0,0
2,40.3,18.0,195.0,3250.0,0
4,36.7,19.3,193.0,3450.0,0
5,39.3,20.6,190.0,3650.0,0


In [18]:
# Deep learnings models work best when featurees are on similar scales

penguins['FlipperLength'] = penguins['FlipperLength']/10
penguins['BodyMass'] = penguins['BodyMass']/100


In [19]:
# The data is too small to be useful for deep learning
# So we'll oversample it to increase its size

for i in range(1,3):
  penguins = penguins.append(penguins)

sample = penguins.sample(10)
sample

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
156,47.6,14.5,21.5,54.0,1
178,44.5,14.3,21.6,41.0,1
292,50.3,20.0,19.7,33.0,2
145,39.0,18.7,18.5,36.5,0
127,41.5,18.3,19.5,43.0,0
39,39.8,19.1,18.4,46.5,0
220,43.5,14.2,22.0,47.0,1
279,45.4,18.7,18.8,35.25,2
280,52.7,19.8,19.7,37.25,2
300,46.7,17.9,19.5,33.0,2


* The Species column is the label our model will predict. Each label value represents a class of penguin species, encoded as 0, 1, or 2. The following code shows the actual species to which these class labels corrrespond.

In [20]:
penguin_classes = ['Adelie', 'Gentoo', 'Chinstrap']

print(sample.columns[0:5].values, 'SpeciesName')
for index, row in penguins.sample(10).iterrows():
    print('[', row[0], row[1], row[2], row[3], int(row[4]), ']', penguin_classes[int(row[-1])])

['CulmenLength' 'CulmenDepth' 'FlipperLength' 'BodyMass' 'Species'] SpeciesName
[ 48.4 14.4 20.3 46.25 1 ] Gentoo
[ 49.0 16.1 21.6 55.5 1 ] Gentoo
[ 40.6 18.6 18.3 35.5 0 ] Adelie
[ 48.5 17.5 19.1 34.0 2 ] Chinstrap
[ 49.3 19.9 20.3 40.5 2 ] Chinstrap
[ 47.2 15.5 21.5 49.75 1 ] Gentoo
[ 46.6 14.2 21.0 48.5 1 ] Gentoo
[ 53.5 19.9 20.5 45.0 2 ] Chinstrap
[ 41.1 17.5 19.0 39.0 0 ] Adelie
[ 36.4 17.1 18.4 28.5 0 ] Adelie


* As it is common in a supervised learning problem, we'll split the dataset into a set of records with which to train the model, and a smaller set with which to validate the trained model.

In [26]:
from sklearn.model_selection import train_test_split

features = ['CulmenLength','CulmenDepth','FlipperLength','BodyMass']
target = 'Species'

# Splitting the data
X_train, y_train, X_test, y_test = train_test_split(penguins[features].values, penguins[target].values,
                                                    test_size = 0.3, random_state = 0)

print(f'Training Set: {len(X_train)}\nTest Set: {len(X_test)}')
print('\nSample Of Features and Target: ')

# Take a look at first 25 training features and corresponding label
for n in range(0,24):
    print(X_train[n], y_train[n], '(' + penguin_classes[y_train[n]] + ')')

Training Set: 957
Test Set: 957

Sample Of Features and Target: 


TypeError: ignored