In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans



In [None]:
# Load the Palmer Archipelago penguin measurements into `data`; every later
# cell reads from (and some modify) this frame.
data = pd.read_csv('../input/palmer-archipelago-antarctica-penguin-data/penguins_size.csv')

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Column labels available for the analysis below.
data.columns

In [None]:
# Dtypes and non-null counts per column (first hint at missing values).
data.info()

The dataset consists of 7 columns.

* species: penguin species (Chinstrap, Adélie, or Gentoo)
* culmen_length_mm: culmen length 
* culmen_depth_mm: culmen depth 
* flipper_length_mm: flipper length 
* body_mass_g: body mass 
* island: island name (Dream, Torgersen, or Biscoe) 
* sex: penguin sex

**culmen length & depth :**

The culmen is the upper ridge of a bird's beak

![](https://image.shutterstock.com/z/stock-vector-diagram-showing-parts-of-penguin-illustration-461548348.jpg)

In [None]:
# Class balance: how many rows does each penguin species contribute?
# The bar chart and the displayed table are both built from the same counts.
print("Number of data points in each species")
data['species'].value_counts().plot.bar()
data['species'].value_counts()

'Adelie' has the most data points.

In [None]:
# Share of missing entries per column, as a percentage of all rows.
print("Missing Values")
data.isna().sum().mul(100).div(len(data))

**We have 5 columns with missing values. One of them (`sex`) is a categorical feature; the remaining four numeric columns can be imputed.**

# Columns to be imputed

* culmen_length_mm
* culmen_depth_mm
* flipper_length_mm
* body_mass_g

In [None]:
# Fill missing numeric measurements with each column's median.
# The filled column is assigned back to `data` instead of calling
# `fillna(..., inplace=True)` on `data[column]`: that form is chained
# assignment on a column selection, which may operate on a temporary copy
# (and does nothing under pandas Copy-on-Write), leaving the NaNs in place.
col = ['culmen_depth_mm','culmen_length_mm','flipper_length_mm','body_mass_g']
for column in col:
    data[column] = data[column].fillna(data[column].median())

In [None]:
# Rows with no recorded sex are labelled 'MALE'.
data.loc[data['sex'].isna(), 'sex'] = 'MALE'

In [None]:
# Scatter of culmen length vs. culmen depth, one colour per species.
# `height` is the current name of the facet-size keyword; the older `size`
# spelling is rejected by recent seaborn releases, and the KDE FacetGrid cell
# later in this notebook already uses `height`.
sns.set_style('whitegrid')
(
    sns.FacetGrid(data, hue="species", height=4)
    .map(plt.scatter, "culmen_length_mm", "culmen_depth_mm")
    .add_legend()
)
plt.show()

# **Observations**

* Using the culmen_depth_mm and culmen_length_mm features, we can distinguish Adelie from the others.
* Separating Chinstrap and Gentoo is a bit harder, as they have some overlapping points.

In [None]:
# Pairwise scatter/density grid of all numeric features, coloured by species.
# Uses `height` (the replacement for the removed `size` keyword) so the cell
# runs on current seaborn.
plt.close()
sns.set_style("whitegrid")
sns.pairplot(data, hue='species', height=4)
plt.show()

# **Observations**

* culmen_length_mm and flipper_length_mm are the most useful features for identifying the different types of penguins.
* Gentoo can easily be identified (it is linearly separable); Adelie and Chinstrap are harder to tell apart, as they have some overlapping points.

In [None]:
# Distribution of culmen length within each species, as a box plot.
sns.boxplot(data=data, x='species', y='culmen_length_mm')
plt.show()

In [None]:
# Distribution of culmen depth within each species, as a violin plot.
sns.violinplot(data=data, x='species', y='culmen_depth_mm')
plt.show()

In [None]:
# Every individual flipper-length observation, grouped by species.
sns.swarmplot(data=data, x='species', y='flipper_length_mm')
plt.show()

In [None]:
# Per-species kernel density estimate of body mass, overlaid on one axis.
# NOTE(review): `shade=True` is the older spelling of `fill=True` in
# seaborn's kdeplot — confirm against the installed seaborn version.
(
    sns.FacetGrid(data, hue="species", height=6)
    .map(sns.kdeplot, "body_mass_g", shade=True)
    .add_legend()
)
plt.show()

# Multivariate probability density, contour plot

In [None]:
# Bivariate KDE (contours) of culmen length against flipper length.
sns.jointplot(
    data=data,
    x="culmen_length_mm",
    y="flipper_length_mm",
    kind="kde",
    height=7,
    space=0,
)

In [None]:
# Bar height = culmen depth per species, split by the recorded sex value.
sns.catplot(
    data=data,
    x="species",
    y="culmen_depth_mm",
    hue="sex",
    kind="bar",
    height=6,
    palette="muted",
)

# Observation

* In all three species the bar for males is higher than the bar for females, and the Chinstrap species has the highest culmen_depth.
* An important observation is that the Gentoo species shows three values of sex. The third value is a '.', which was probably entered by mistake. Let's look into it and see what we can do with it.

In [None]:
# Show the row(s) whose sex was recorded as a bare '.'.
data.loc[data['sex'] == '.']

Let's look at the island feature to check which sex dominates on Biscoe island, so that we can replace this particular value with it.

In [None]:
# Bar height = culmen length per island, split by sex.
sns.catplot(
    data=data,
    x="island",
    y="culmen_length_mm",
    hue="sex",
    kind="bar",
    height=6,
    palette="muted",
)

Male is the dominant sex in both the island feature and the species feature, so I will replace the '.' value with MALE.

In [None]:
# Replace the stray '.' sex entry with 'MALE'.
# The row is selected by its value instead of the hard-coded label 336, so the
# fix still hits the right record (and every such record) if the index is
# reset, rows are dropped, or the source file is re-ordered.
data.loc[data['sex'] == '.', 'sex'] = 'MALE'

In [None]:
# Bar height = culmen length per species, split by sex.
sns.catplot(
    data=data,
    x="species",
    y="culmen_length_mm",
    hue="sex",
    kind="bar",
    height=6,
    palette="muted",
)

# Observation

Chinstrap has the highest culmen length for both males and females.

In [None]:
# Bar height = culmen depth per species, split by sex (after the '.' fix).
sns.catplot(
    data=data,
    x="species",
    y="culmen_depth_mm",
    hue="sex",
    kind="bar",
    height=6,
    palette="muted",
)

# Observation

In the male category, Chinstrap has the highest culmen depth.

In the female category, Adelie and Chinstrap are very close, but Adelie appears to be slightly ahead.

In [None]:
# Bar height = flipper length per species, split by sex.
sns.catplot(
    data=data,
    x="species",
    y="flipper_length_mm",
    hue="sex",
    kind="bar",
    height=6,
    palette="muted",
)

# Observation

Gentoo has the highest flipper length in both categories.

In [None]:
# Bar height = body mass per species, split by sex.
sns.catplot(
    data=data,
    x="species",
    y="body_mass_g",
    hue="sex",
    kind="bar",
    height=6,
    palette="muted",
)

# Observation

Gentoo has the highest body weight for both males and females.

In [None]:
# Horizontal bar chart: number of penguins recorded on each island.
# The two columns are named explicitly because the labels produced by
# `value_counts().reset_index()` differ between pandas versions
# ('index'/'island' before 2.0, 'island'/'count' from 2.0 on), so the
# previous `x='island', y='index'` only worked on old pandas.
island_counts = (
    data['island']
    .value_counts()
    .rename_axis('island_name')
    .reset_index(name='penguin_count')
)
fig = sns.barplot(data=island_counts, x='penguin_count', y='island_name')
fig.set(xlabel='', ylabel='ISLANDS')
plt.show()

# Observation

Biscoe Island contains the largest number of penguins.

# Which species live on each island, and how many

In [None]:
# Row count per species across the whole dataset (all islands combined).
data['species'].value_counts()

Let's start with Biscoe, as it contains the largest number of penguins.

In [None]:
# Species breakdown for penguins recorded on Biscoe island.
df = data.loc[data['island'] == 'Biscoe']
print(df['species'].value_counts())
df['species'].value_counts().plot.bar()

# Observation

There are 344 penguins in total, of which:

* 124 are Gentoo
* 68 are Chinstrap
* 152 are Adelie

According to the above graph and stats, it's clear that **all the Gentoo penguins are on Biscoe island**.

There are no Chinstrap penguins on Biscoe island.

# Let's proceed with Dream Island

In [None]:
# Species breakdown for penguins recorded on Dream island.
df = data.loc[data['island'] == 'Dream']
print(df['species'].value_counts())
df['species'].value_counts().plot.bar()

# Observation

**All the Chinstrap penguins live in Dream island and there is no Gentoo penguins in Dream island**

# Torgersen Island

In [None]:
# Species breakdown for penguins recorded on Torgersen island.
df = data.loc[data['island'] == 'Torgersen']
print(df['species'].value_counts())
df['species'].value_counts().plot.bar()

# Observation

**Torgersen island Contains only Adelie penguins**

# Concluding Observations from above insights

* All chinstrap penguins live in Dream Island
* All Gentoo penguins live in Biscoe Island
* Adelie penguins are distributed everywhere
* Torgersen Island contains only one type of penguin which is Adelie