<h1>IMPORT LIBRARIES</h1>

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

<h1>IMPORTING DATASETS, LOCALLY, TO DATAFRAME</h1>

In [4]:
penguins = pd.read_csv("./datasets/penguins.csv")

In [5]:
penguins

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,1,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,2,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,3,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,4,Adelie,Torgersen,,,,,,2007
4,5,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
...,...,...,...,...,...,...,...,...,...
339,340,Chinstrap,Dream,55.8,19.8,207.0,4000.0,male,2009
340,341,Chinstrap,Dream,43.5,18.1,202.0,3400.0,female,2009
341,342,Chinstrap,Dream,49.6,18.2,193.0,3775.0,male,2009
342,343,Chinstrap,Dream,50.8,19.0,210.0,4100.0,male,2009


In [6]:
penguins["wkilo"] = np.round(penguins["body_mass_g"]/1000,1)

In [7]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              344 non-null    int64  
 1   species            344 non-null    object 
 2   island             344 non-null    object 
 3   bill_length_mm     342 non-null    float64
 4   bill_depth_mm      342 non-null    float64
 5   flipper_length_mm  342 non-null    float64
 6   body_mass_g        342 non-null    float64
 7   sex                333 non-null    object 
 8   year               344 non-null    int64  
 9   wkilo              342 non-null    float64
dtypes: float64(5), int64(2), object(3)
memory usage: 27.0+ KB


In [8]:
penguins['wkilo']

0      3.8
1      3.8
2      3.2
3      NaN
4      3.4
      ... 
339    4.0
340    3.4
341    3.8
342    4.1
343    3.8
Name: wkilo, Length: 344, dtype: float64

<h1>Checking the null values</h1>

In [9]:
penguins.isnull().sum()

rowid                 0
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
wkilo                 2
dtype: int64

<h1>FILLING NULL VALUES</h1>

In [10]:
Average_bill_length_mm = penguins["bill_length_mm"].mean()
Average_bill_depth_mm = penguins["bill_depth_mm"].mean()
Average_flipper_length_mm = penguins["flipper_length_mm"].mean()
Average_body_mass_g = penguins["body_mass_g"].mean()
MostAppearingSex = penguins["sex"].mode()[0] #.mode() always return choices, we must specify to get the top results
Average_wkilo = penguins['wkilo'].mean()

In [11]:
penguins["bill_length_mm"] = penguins["bill_length_mm"].fillna(Average_bill_length_mm)
penguins["bill_depth_mm"] = penguins["bill_depth_mm"].fillna(Average_bill_depth_mm)
penguins["flipper_length_mm"] = penguins["flipper_length_mm"].fillna(Average_flipper_length_mm)
penguins["body_mass_g"] = penguins["flipper_length_mm"].fillna(Average_body_mass_g)
penguins["sex"] = penguins["sex"].fillna(MostAppearingSex)
penguins['wkilo'] = penguins['wkilo'].fillna(Average_wkilo)

In [12]:
penguins.isnull().sum()

rowid                0
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
wkilo                0
dtype: int64

<h1>CONVERT NON COMPUTABLE COLUMN TO CATEGORY</h1>

In [13]:
penguins["rowid"] = penguins["rowid"].astype('category')
penguins["species"] = penguins["species"].astype('category')
penguins["island"] = penguins["island"].astype('category')
penguins["sex"] = penguins["sex"].astype('category')
penguins["year"] = penguins["year"].astype('category')

In [14]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   rowid              344 non-null    category
 1   species            344 non-null    category
 2   island             344 non-null    category
 3   bill_length_mm     344 non-null    float64 
 4   bill_depth_mm      344 non-null    float64 
 5   flipper_length_mm  344 non-null    float64 
 6   body_mass_g        344 non-null    float64 
 7   sex                344 non-null    category
 8   year               344 non-null    category
 9   wkilo              344 non-null    float64 
dtypes: category(5), float64(5)
memory usage: 26.9 KB


<h2>Gentoo In All Island</h2>

In [15]:
def findIslandBirds(island, species):
    return penguins[(penguins['island'] == island) & (penguins['species']== species)]


In [16]:
gentooBiscoe = findIslandBirds('Biscoe','Gentoo')
gentooDream = findIslandBirds('Dream','Gentoo')
gentooTorg = findIslandBirds('Torgersen','Gentoo')


In [17]:
gentooBiscoe

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,wkilo
152,153,Gentoo,Biscoe,46.10000,13.20000,211.000000,211.000000,female,2007,4.500000
153,154,Gentoo,Biscoe,50.00000,16.30000,230.000000,230.000000,male,2007,5.700000
154,155,Gentoo,Biscoe,48.70000,14.10000,210.000000,210.000000,female,2007,4.400000
155,156,Gentoo,Biscoe,50.00000,15.20000,218.000000,218.000000,male,2007,5.700000
156,157,Gentoo,Biscoe,47.60000,14.50000,215.000000,215.000000,male,2007,5.400000
...,...,...,...,...,...,...,...,...,...,...
271,272,Gentoo,Biscoe,43.92193,17.15117,200.915205,200.915205,male,2009,4.202047
272,273,Gentoo,Biscoe,46.80000,14.30000,215.000000,215.000000,female,2009,4.800000
273,274,Gentoo,Biscoe,50.40000,15.70000,222.000000,222.000000,male,2009,5.800000
274,275,Gentoo,Biscoe,45.20000,14.80000,212.000000,212.000000,female,2009,5.200000


In [18]:
gentooDream

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,wkilo


In [19]:
gentooTorg

Unnamed: 0,rowid,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year,wkilo


<h1>INSIGHT #1</h1>
<h2>Gentoo exclusively lives in Biscoe</h2>

<h1>Adelie in All Island</h1>

In [20]:
adelieBiscoe = findIslandBirds('Biscoe','Adelie')
adelieDream = findIslandBirds('Dream','Adelie')
adelieTorg = findIslandBirds('Torgersen','Adelie')

In [21]:
len(adelieBiscoe)
print(f'''
ADELIE

BISCOE: {len(adelieBiscoe)}
DREAM: {len(adelieDream)}
TORGERSEN: {len(adelieTorg)}


''')



ADELIE

BISCOE: 44
DREAM: 56
TORGERSEN: 52





<h1>INSIGHT #2</h1>
<h2>ADELIE MAJORITY LIVES ON DREAM ISLAND</h2>

<h1>Does the island impact the body mass of a penguin species</h1>

In [22]:
adelieBiscoe = findIslandBirds('Biscoe','Adelie')
adelieDream = findIslandBirds('Dream','Adelie')
adelieTorgersen = findIslandBirds('Torgersen','Adelie')


print(f'''

AVERAGE WEIGHT OF ADELIE IN EACH ISLAND

Biscoe: {np.round(adelieBiscoe['wkilo'].mean(),2)}
Dream: {np.round(adelieDream['wkilo'].mean(),2)}
Torgersen: {np.round(adelieTorgersen['wkilo'].mean(),2)}

''')



AVERAGE WEIGHT OF ADELIE IN EACH ISLAND

Biscoe: 3.72
Dream: 3.69
Torgersen: 3.71




In [23]:
np.round(adelieBiscoe['wkilo'].mean(),1)

3.7

<h1>INSIGHT #3</h1>
<h2>ISLAND DOES NOT IMPACT THE BODY MASS OF ADELIE SPECIES</h2>

<h1>Determine the weight of each species in general</h1>

In [24]:
gentoSpecies = penguins[penguins['species'] == 'Gentoo']
adelieSpecies = penguins[penguins['species'] == 'Adelie']
chinSpecies = penguins[penguins['species'] == 'Chinstrap']

In [25]:
a = gentoSpecies['wkilo'].mean()
b = adelieSpecies['wkilo'].mean()
c = chinSpecies['wkilo'].mean()

print(f'''

AVERAGE WEIGHT OF EACH SPECIES OF PENGUINS

GENTOO: {np.round(a,1)}
ADELIE: {np.round(b,1)}
CHINSTRAP: {np.round(c,1)}

''')



AVERAGE WEIGHT OF EACH SPECIES OF PENGUINS

GENTOO: 5.1
ADELIE: 3.7
CHINSTRAP: 3.7




<h1>INSIGHT #4</h1>
<h1>The Average weight of Gentoo Species is 5.1</h1>
<br/>
<h1>INSIGHT #5</h1>
<h1>The Average weight of Adelie Species is 3.7</h1>
<br/>
<h1>INSIGHT #6</h1>
<h1>The Average weight of Chinstrap Species is 3.7</h1>

<H1>DATA AGGREGATIONS</H1>

<ol>
    <li>Name the group by refenrecne data frame</li>
    <li>Call the dataframe</li>
    <li>groupby() function</li>
    <li>columns to group</li>
    <li>group by function</li>
</ol>

In [26]:
penguin_agg = penguins.groupby(['species','island']).size()

  penguin_agg = penguins.groupby(['species','island']).size()


In [27]:
penguin_agg

species    island   
Adelie     Biscoe        44
           Dream         56
           Torgersen     52
Chinstrap  Biscoe         0
           Dream         68
           Torgersen      0
Gentoo     Biscoe       124
           Dream          0
           Torgersen      0
dtype: int64

In [28]:
penguin_agg2 = np.round(penguins.groupby(['species','island'], observed=False)['wkilo'].mean(),1)

In [29]:
penguin_agg2

species    island   
Adelie     Biscoe       3.7
           Dream        3.7
           Torgersen    3.7
Chinstrap  Biscoe       NaN
           Dream        3.7
           Torgersen    NaN
Gentoo     Biscoe       5.1
           Dream        NaN
           Torgersen    NaN
Name: wkilo, dtype: float64

<h1>WHAT YEAR HAS THE HIGHEST RECORDED 'male' and 'Adelie' and in what island?</h1>

In [30]:
penguin_agg3 = penguins.groupby(['species','island','year','sex']).size()

  penguin_agg3 = penguins.groupby(['species','island','year','sex']).size()


In [31]:
penguin_agg3.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,sex,female,male
species,island,year,Unnamed: 3_level_1,Unnamed: 4_level_1
Adelie,Biscoe,2007,5,5
Adelie,Biscoe,2008,9,9
Adelie,Biscoe,2009,8,8
Adelie,Dream,2007,9,11
Adelie,Dream,2008,8,8
Adelie,Dream,2009,10,10
Adelie,Torgersen,2007,8,12
Adelie,Torgersen,2008,8,8
Adelie,Torgersen,2009,8,8
Chinstrap,Biscoe,2007,0,0


<h1>INSIGHT #7</h1>
<h2>The Weight in Kilo of Adelie in Torgersen island dropped by 4 kilo from 2007 and remain unchanged in following years</h2>

<h1>INSIGHT #8</h1>
<h2>The heaviest recorded of Adelie is in Dream Island in 2007 and 2009</h2>

In [32]:
penguin_agg3 = np.round(penguins.groupby(['species','sex'])['wkilo'].mean(),1)

  penguin_agg3 = np.round(penguins.groupby(['species','sex'])['wkilo'].mean(),1)


In [33]:
penguin_agg3.unstack()

sex,female,male
species,Unnamed: 1_level_1,Unnamed: 2_level_1
Adelie,3.4,4.0
Chinstrap,3.5,3.9
Gentoo,4.7,5.4


<h1>INSIGHT #9</h1>
<h2>IN ADELIE, MALE IS THE HEAVIEST with average of 4 kilo.</h2>
<br/>
<h1>INSIGHT #10</h1>
<h2>IN CHINSTRAP, MALE IS THE HEAVIST with the average of 3.9 kilo</h2>
<br/>
<h1>INSIGHT #11</h1>
<h2>IN GENTOO, MALE IS THE HEAVIST with the average of 5.4 kilo</h2>

<p>SI EZEQUIL PO NANGUNGUPYA</p>

<hr/>

<h1>CORRELATION</h1>

<h2>REQUIREMENTS</h2>
<ol>
    <li>Name variable for the correlation</li>
    <li>Call the first Column</li>
    <li>test the correlation using corr() funviton</li>
</ol>

<h1>DOES THE BILL LENGTH AFFECT THE SIZE OF THE BILL DEPTH</h1>

In [34]:
bill_length_depth_corr = penguins['bill_length_mm'].corr(penguins['bill_depth_mm'])

In [35]:
bill_length_depth_corr

-0.23505287035553282

<h1>INSIGHT #12</h1>
<h1>THERE IS NO CORRELATION BETWEEN BILL LENGTH AND BILL DEPTH AS IT YIELDS -0.2 P-VALUE</h1>

<h1>DOES THE BODY MASS AFFECT THE BILL DEPTH</h1>

In [36]:
mass_depth_corr = penguins['body_mass_g'].corr(penguins['bill_depth_mm'])

In [37]:
mass_depth_corr

-0.5838512164654123

<h1>INSIGHT #13</h1>
<h1>THE BODY MASS DOES NOT AFFECT THE BILL DEPTH OF A PENGUINAS IT YIELDS A PROBABILITY VALUE OF -0.5</h1>

<h1>DOES THE BILL LENGTH AND FLIPPER LENGTH CORRELATES EACH OTHER</h1>

In [38]:
bill_flipper_corr = penguins['bill_length_mm'].corr(penguins['flipper_length_mm'])

In [39]:
bill_flipper_corr

0.6561813407464278

<h1>INSIGHT #14</h1>
<h1>THE FLIPPER LENGTH AND BILL LENGTH IS CONNECTED MEANING THAT THERE ARE SIGNIFICANT CORRELATIONS</h1>

<h1>DOES THE BODY MASS AFFECT THE BILL LENGTH</h1>

In [40]:
mass_length_corr = penguins['body_mass_g'].corr(penguins['bill_length_mm'])

In [41]:
mass_length_corr

0.6561813407464278

<h1>INSIGHT #15</h1>
<h1>THE BODY MASS DOES IN FACT AFFECT THE LENGTH OF A PENGUINS LENGTH</h1>

In [42]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   rowid              344 non-null    category
 1   species            344 non-null    category
 2   island             344 non-null    category
 3   bill_length_mm     344 non-null    float64 
 4   bill_depth_mm      344 non-null    float64 
 5   flipper_length_mm  344 non-null    float64 
 6   body_mass_g        344 non-null    float64 
 7   sex                344 non-null    category
 8   year               344 non-null    category
 9   wkilo              344 non-null    float64 
dtypes: category(5), float64(5)
memory usage: 26.9 KB


<h1>CORRELATION BETWEEN QUALITATIVE AND QUANTITATIVE</h1>

<h3>Sex and Body Mass</h3>

<h2>Library for ONE  WAY ANNOVA -- SCIENCE PYTHON (scipy)</h2>

In [43]:
from scipy.stats import f_oneway

<h3>Requirements for one way annova</h3>

<h3>Filter columns and its data</h3>

In [44]:
male = penguins[penguins['sex'] == 'male']['body_mass_g']
female = penguins[penguins['sex'] == 'female']['body_mass_g']

f_stats, p_value = f_oneway(male, female)

print(p_value)

4.88065474065626e-06


<h1>INSIGHT #16</h1>
<h2>There is a correlation between sex and body mass</h2>

In [45]:
male

0      181.000000
3      200.915205
5      190.000000
7      195.000000
8      193.000000
          ...    
334    202.000000
336    206.000000
339    207.000000
341    193.000000
342    210.000000
Name: body_mass_g, Length: 179, dtype: float64