<h3> Palmer Penguin Data Analytics </h3>
<hr>
<h3> Analyst: Nathaniel Esguerra </h3>

In [1]:
import pandas as pd
import numpy as np

In [2]:
penguins = pd.read_csv('datasets\\penguins.csv')

In [3]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


<h3> How to fillup Null Values</h3>

In [4]:
chinstrap_weight = penguins[(penguins['species'] == "Chinstrap") & (penguins['body_mass_g'] >= 4000)]

In [5]:
penguins['bill_length_mm'].mean()

np.float64(43.9219298245614)

In [6]:
penguins['bill_length_mm'].fillna(penguins['bill_length_mm'].mean())

0      39.10000
1      39.50000
2      40.30000
3      43.92193
4      36.70000
         ...   
339    55.80000
340    43.50000
341    49.60000
342    50.80000
343    50.20000
Name: bill_length_mm, Length: 344, dtype: float64

In [7]:
penguins['bill_length_mm'] = penguins['bill_length_mm'].fillna(penguins['bill_length_mm'].mean())

In [8]:
penguins.isnull().sum()

species               0
island                0
bill_length_mm        0
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [9]:
penguins['sex'].mode()

0    male
Name: sex, dtype: object

In [10]:
penguins['sex'] = penguins['sex'].fillna(penguins['sex'].mode()[0])

In [11]:
penguins['bill_depth_mm'].mean()

np.float64(17.151169590643274)

In [12]:
penguins['bill_depth_mm'] = penguins['bill_depth_mm'].fillna(penguins['bill_depth_mm'].mean())

In [13]:
penguins['flipper_length_mm'].mean()

np.float64(200.91520467836258)

In [14]:
penguins['flipper_length_mm'] = penguins['flipper_length_mm'].fillna(penguins['flipper_length_mm'].mean())

In [15]:
penguins['body_mass_g'].mean()

np.float64(4201.754385964912)

In [16]:
penguins['body_mass_g'] = penguins['body_mass_g'].fillna(penguins['body_mass_g'].mean())

In [17]:
penguins.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

<h3> Descriptive Analysis </h3>

In [18]:
penguins.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year
count,344.0,344.0,344.0,344.0,344.0
mean,43.92193,17.15117,200.915205,4201.754386,2008.02907
std,5.443643,1.969027,14.020657,799.613058,0.818356
min,32.1,13.1,172.0,2700.0,2007.0
25%,39.275,15.6,190.0,3550.0,2007.0
50%,44.25,17.3,197.0,4050.0,2008.0
75%,48.5,18.7,213.0,4750.0,2009.0
max,59.6,21.5,231.0,6300.0,2009.0


In [19]:
penguins.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
bill_length_mm,344.0,43.92193,5.443643,32.1,39.275,44.25,48.5,59.6
bill_depth_mm,344.0,17.15117,1.969027,13.1,15.6,17.3,18.7,21.5
flipper_length_mm,344.0,200.915205,14.020657,172.0,190.0,197.0,213.0,231.0
body_mass_g,344.0,4201.754386,799.613058,2700.0,3550.0,4050.0,4750.0,6300.0
year,344.0,2008.02907,0.818356,2007.0,2007.0,2008.0,2009.0,2009.0


<h3> Cleaning the Data Frame</h3>

In [20]:
penguins.isnull().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

<h3> Creating Data Frames </h3>

In [21]:
penguins['species'].unique()

array(['Adelie', 'Gentoo', 'Chinstrap'], dtype=object)

In [22]:
adelie_df = penguins[penguins['species'] == 'Adelie']

In [23]:
adelie_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,male,2007
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,female,2007
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,female,2007
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,male,2007
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,female,2007
...,...,...,...,...,...,...,...,...
147,Adelie,Dream,36.60000,18.40000,184.000000,3475.000000,female,2009
148,Adelie,Dream,36.00000,17.80000,195.000000,3450.000000,female,2009
149,Adelie,Dream,37.80000,18.10000,193.000000,3750.000000,male,2009
150,Adelie,Dream,36.00000,17.10000,187.000000,3700.000000,female,2009


In [24]:
gentoo_df = penguins[penguins['species'] == "Gentoo"]

In [25]:
gentoo_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
152,Gentoo,Biscoe,46.10000,13.20000,211.000000,4500.000000,female,2007
153,Gentoo,Biscoe,50.00000,16.30000,230.000000,5700.000000,male,2007
154,Gentoo,Biscoe,48.70000,14.10000,210.000000,4450.000000,female,2007
155,Gentoo,Biscoe,50.00000,15.20000,218.000000,5700.000000,male,2007
156,Gentoo,Biscoe,47.60000,14.50000,215.000000,5400.000000,male,2007
...,...,...,...,...,...,...,...,...
271,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,male,2009
272,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,female,2009
273,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,male,2009
274,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,female,2009


In [26]:
chinstrap_df = penguins[penguins['species'] == "Chinstrap"]

In [27]:
chinstrap_df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
276,Chinstrap,Dream,46.5,17.9,192.0,3500.0,female,2007
277,Chinstrap,Dream,50.0,19.5,196.0,3900.0,male,2007
278,Chinstrap,Dream,51.3,19.2,193.0,3650.0,male,2007
279,Chinstrap,Dream,45.4,18.7,188.0,3525.0,female,2007
280,Chinstrap,Dream,52.7,19.8,197.0,3725.0,male,2007
...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [28]:
penguins['body_mass_g'] >= 4000

0      False
1      False
2      False
3       True
4      False
       ...  
339     True
340    False
341    False
342     True
343    False
Name: body_mass_g, Length: 344, dtype: bool

In [29]:
chinstrap_weight.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 284 to 342
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            16 non-null     object 
 1   island             16 non-null     object 
 2   bill_length_mm     16 non-null     float64
 3   bill_depth_mm      16 non-null     float64
 4   flipper_length_mm  16 non-null     float64
 5   body_mass_g        16 non-null     float64
 6   sex                16 non-null     object 
 7   year               16 non-null     int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 1.1+ KB


<h3> How to change data type of a column </h3>

In [30]:
penguins['species'] = penguins['species'].astype('category')
penguins['year'] = penguins['year'].astype('category')
penguins['island'] = penguins['island'].astype('category')
penguins['sex'] = penguins['sex'].astype('category')

In [31]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            344 non-null    category
 1   island             344 non-null    category
 2   bill_length_mm     344 non-null    float64 
 3   bill_depth_mm      344 non-null    float64 
 4   flipper_length_mm  344 non-null    float64 
 5   body_mass_g        344 non-null    float64 
 6   sex                344 non-null    category
 7   year               344 non-null    category
dtypes: category(4), float64(4)
memory usage: 12.7 KB


In [32]:
penguins.describe()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
count,344.0,344.0,344.0,344.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.443643,1.969027,14.020657,799.613058
min,32.1,13.1,172.0,2700.0
25%,39.275,15.6,190.0,3550.0
50%,44.25,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [33]:
penguins['mass_kg'] = np.round(penguins['body_mass_g'] / 1000,1)

In [34]:
penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,mass_kg
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,male,2007,3.8
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,female,2007,3.8
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,female,2007,3.2
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,male,2007,4.2
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,female,2007,3.4
...,...,...,...,...,...,...,...,...,...
339,Chinstrap,Dream,55.80000,19.80000,207.000000,4000.000000,male,2009,4.0
340,Chinstrap,Dream,43.50000,18.10000,202.000000,3400.000000,female,2009,3.4
341,Chinstrap,Dream,49.60000,18.20000,193.000000,3775.000000,male,2009,3.8
342,Chinstrap,Dream,50.80000,19.00000,210.000000,4100.000000,male,2009,4.1


<h3> Exploratory Data Analytics (EDA) </h3>

<h3> 1. Which species has the least female sex? </h3>

In [35]:
penguins[(penguins['species'] == 'Gentoo') & (penguins['sex'] == 'female')].value_counts().sum()

np.int64(58)

In [36]:
penguins[(penguins['species'] == 'Chinstrap') & (penguins['sex'] == 'female')].value_counts().sum()

np.int64(34)

In [37]:
penguins[(penguins['species'] == 'Adelie') & (penguins['sex'] == 'female')].value_counts().sum()

np.int64(73)

<h3> Chinstrap has the least female sex</h3>

<h3> 2. Which species has the highest population of male? </h3>

In [38]:
gentoo_df['sex'].value_counts()

sex
male      66
female    58
Name: count, dtype: int64

In [39]:
chinstrap_df['sex'].value_counts()

sex
female    34
male      34
Name: count, dtype: int64

adelie_df['sex'].value_counts()

<h3> Adelie has the highest population of male</h3>

<h3> 3. What year has the least male sex penguins?</h3>

In [40]:
penguins[penguins['sex'] == 'male']['year'].value_counts()

year
2009    62
2007    59
2008    58
Name: count, dtype: int64

<h3> 2007 has the least male sex penguins </h3>

<h3> 4. Which of the three islands has the highest number of species?</h3>

In [41]:
penguins['island'].value_counts()

island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64

<h3> As indicated above, the highest number of species in an island is Biscoe</h3>

<h3> 5. What species have the most bill length?</h3>

In [42]:
chinstrap_df['bill_length_mm'].mean()


np.float64(48.83382352941177)

In [43]:
gentoo_df['bill_length_mm'].mean()

np.float64(47.475983305036785)

In [44]:
adelie_df['bill_length_mm'].mean()

np.float64(38.82514427516159)

<h3> As indicated, Chinstrap has the most bill length among all species</h3>

<h3> 6. What is the average body mass of Chinstrap?</h3>

In [45]:
avg_chinstrap_mass = penguins[penguins["species"] == "Chinstrap"]["body_mass_g"].mean()
avg_chinstrap_mass

np.float64(3733.0882352941176)

<h3> The data indicates that Chinstrap penguins have an average body mass of 3733.08 grams </h3>

<h3> 7. Which island has the most penguins?</h3>

In [48]:
penguins['island'].value_counts()

island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64

<h3> Biscoe Island contains the highest number of penguins compared to the other islands </h3>