1. Load the Penguins dataset into a pandas DataFrame
2. Display the first 5 rows.
3. Calculate the average 'bill_length_mm' for each species of penguins.
4. the penguin with the highest 'body_mass_g' and display its species and other information.
5. Create a new DataFrame containing only the penguins with 'sex' as 'MALE' and 'island' as 'Torgersen'.
6. Calculate the correlation matrix for 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', and 'body_mass_g'.
7. For each species of penguins, find the mean, median, minimum, and maximum 'body_mass_g'.
8. Replace any missing values in the 'sex' column with the most frequent value in that column.
9. Create a new column in the DataFrame called 'bill_area', which is the product of 'bill_length_mm' and 'bill_depth_mm'.
10. Group the DataFrame by 'species' and calculate the average 'body_mass_g' and 'flipper_length_mm' for each species.
11. Calculate the total count of penguins for each 'island' and 'sex' combination.


## Load the Penguins dataset into a pandas DataFrame

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = sns.load_dataset('penguins')

## Display the first 5 rows.

In [3]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## Calculate the average 'bill_length_mm' for each species of penguins.

In [4]:
average_bill = df.groupby('species')['bill_length_mm'].mean()

In [5]:
print(f" average 'bill_length_mm' for each species of penguins", average_bill)

 average 'bill_length_mm' for each species of penguins species
Adelie       38.791391
Chinstrap    48.833824
Gentoo       47.504878
Name: bill_length_mm, dtype: float64


## Find the penguin with the highest 'body_mass_g' and display its species and other information.

In [7]:
highest_body_mass_penguine = df.loc[df['body_mass_g'].idxmax()]
print(highest_body_mass_penguine)

species              Gentoo
island               Biscoe
bill_length_mm         49.2
bill_depth_mm          15.2
flipper_length_mm     221.0
body_mass_g          6300.0
sex                    Male
Name: 237, dtype: object


## Create a new DataFrame containing only the penguins with 'sex' as 'MALE' and 'island' as 'Torgersen'.

In [9]:
df_new = df[(df['sex']=='MALE') & (df['island']=='Torgersen')]

In [10]:
df_new

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex


## Calculate the correlation matrix for 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', and 'body_mass_g'.

In [12]:
corr = df[['bill_length_mm','bill_depth_mm','flipper_length_mm','body_mass_g']].corr()
print(corr)

                   bill_length_mm  bill_depth_mm  flipper_length_mm  \
bill_length_mm           1.000000      -0.235053           0.656181   
bill_depth_mm           -0.235053       1.000000          -0.583851   
flipper_length_mm        0.656181      -0.583851           1.000000   
body_mass_g              0.595110      -0.471916           0.871202   

                   body_mass_g  
bill_length_mm        0.595110  
bill_depth_mm        -0.471916  
flipper_length_mm     0.871202  
body_mass_g           1.000000  


## For each species of penguins, find the mean, median, minimum, and maximum 'body_mass_g'.

In [14]:
stats = df.groupby('species')['body_mass_g'].agg(['mean','median','min','max'])
print(stats)

                  mean  median     min     max
species                                       
Adelie     3700.662252  3700.0  2850.0  4775.0
Chinstrap  3733.088235  3700.0  2700.0  4800.0
Gentoo     5076.016260  5000.0  3950.0  6300.0


## Replace any missing values in the 'sex' column with the most frequent value in that column.

In [16]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [19]:
most_frequent_sex = df['sex'].mode().iloc[0]
df['sex'].fillna(most_frequent_sex , inplace = True)

In [20]:
df.isnull().sum()

species              0
island               0
bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
sex                  0
dtype: int64

## Create a new column in the DataFrame called 'bill_area', which is the product of 'bill_length_mm' and 'bill_depth_mm'.

In [21]:
df['bill_area'] = df['bill_length_mm']*df['bill_depth_mm']

In [23]:
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,bill_area
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male,731.17
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female,687.3
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female,725.4
3,Adelie,Torgersen,,,,,Male,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female,708.31


## Group the DataFrame by 'species' and calculate the average 'body_mass_g' and 'flipper_length_mm' for each species.

In [29]:
avg = df.groupby('species').agg({'body_mass_g':'mean' ,'flipper_length_mm':'mean' })
avg


Unnamed: 0_level_0,body_mass_g,flipper_length_mm
species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,3700.662252,189.953642
Chinstrap,3733.088235,195.823529
Gentoo,5076.01626,217.186992


## Calculate the total count of penguins for each 'island' and 'sex' combination.

In [40]:
total_count = df.groupby(['island','sex']).size().reset_index(name = 'count')

In [41]:
total_count

Unnamed: 0,island,sex,count
0,Biscoe,Female,80
1,Biscoe,Male,88
2,Dream,Female,61
3,Dream,Male,63
4,Torgersen,Female,24
5,Torgersen,Male,28
