In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy
from sklearn.ensemble import IsolationForest

In [3]:
# Load the penguin measurements and record the names of the numeric columns.
data = pd.read_csv("penguins_lter.csv")
colnames = data.select_dtypes(include=["number"]).columns.tolist()

In [4]:
# Categorical columns, kept as pandas Series.
species = data["Species"]
island = data["Island"]

# Numeric measurements as (n_rows, 1) column vectors; missing values are still NaN here.
# All six columns use the same shape so they can be handled uniformly.
cl = data["Culmen Length (mm)"].to_numpy().reshape(-1,1)
cd = data["Culmen Depth (mm)"].to_numpy().reshape(-1,1)
fl = data["Flipper Length (mm)"].to_numpy().reshape(-1,1)
bm = data["Body Mass (g)"].to_numpy().reshape(-1,1)
dc = data["Delta 13 C (o/oo)"].to_numpy().reshape(-1,1)
dn = data["Delta 15 N (o/oo)"].to_numpy().reshape(-1,1)

### Odlehlé hodnoty
Úkol: Zjistěte, zda zvolená datová sada obsahuje nějaké odlehlé hodnoty.

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

Unnamed: 0,Sample Number,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
count,344.0,342.0,342.0,342.0,342.0,330.0,331.0
mean,63.151163,43.92193,17.15117,200.915205,4201.754386,8.733382,-25.686292
std,40.430199,5.459584,1.974793,14.061714,801.954536,0.55177,0.793961
min,1.0,32.1,13.1,172.0,2700.0,7.6322,-27.01854
25%,29.0,39.225,15.6,190.0,3550.0,8.29989,-26.320305
50%,58.0,44.45,17.3,197.0,4050.0,8.652405,-25.83352
75%,95.25,48.5,18.7,213.0,4750.0,9.172123,-25.06205
max,152.0,59.6,21.5,231.0,6300.0,10.02544,-23.78767


In [6]:
# Preview the first five rows of the raw table.
data.head()

Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,11/11/07,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,11/16/07,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,11/16/07,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [7]:
def remove_nan(cl):
    """Return the non-NaN entries of ``cl`` as an (n, 1) column vector."""
    is_valid = np.logical_not(np.isnan(cl))
    return cl[is_valid].reshape(-1, 1)

In [8]:
def find_outliers(cl):
    """Print the values of ``cl`` that lie beyond the 1.5 * IQR boxplot fences.

    Parameters
    ----------
    cl : numpy.ndarray
        Numeric sample; the callers in this notebook pass NaN-free arrays.

    Returns
    -------
    None
        The outliers are printed, not returned.
    """
    # Quartiles and interquartile range of the whole sample.
    q1 = np.quantile(cl, 0.25)
    q3 = np.quantile(cl, 0.75)
    iqr = q3 - q1

    # Fences: the farthest a boxplot whisker may reach.
    upper_bound = q3 + (1.5 * iqr)
    lower_bound = q1 - (1.5 * iqr)

    # Strict comparisons: a value sitting exactly on a fence is still inside
    # the whisker. With <= / >= a constant sample (iqr == 0) would have every
    # one of its values reported as an outlier.
    outliers = cl[(cl < lower_bound) | (cl > upper_bound)]
    print('The following are the outliers in the boxplot:{}'.format(outliers))

# Strip the NaNs from every measurement vector, keeping the same variable names.
cl, cd, fl, bm, dc, dn = (
    remove_nan(_values) for _values in (cl, cd, fl, bm, dc, dn)
)

# Report outliers over all species together, one printed line per measurement.
for _values in (cl, cd, fl, bm, dc, dn):
    find_outliers(_values)

The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]


Datová sada neobsahuje žádné odlehlé hodnoty, pokud zkoumáme všechny druhy tučňáků zároveň. Dále budeme zkoumat odlehlé hodnoty pro každý druh tučňáka zvlášť.

In [9]:
def outliers_for_species(species, data):
    """Print the IQR outliers of every numeric measurement for one species.

    Parameters
    ----------
    species : str
        Value of the "Species" column to select; compared for exact equality.
    data : pandas.DataFrame
        Table containing the "Species" column and the six measurement columns
        listed below.

    Returns
    -------
    None
        One line is printed per measurement column, in the order listed.
    """
    subset = data[data["Species"] == species]

    measurement_columns = (
        "Culmen Length (mm)",
        "Culmen Depth (mm)",
        "Flipper Length (mm)",
        "Body Mass (g)",
        "Delta 13 C (o/oo)",
        "Delta 15 N (o/oo)",
    )

    # remove_nan already returns an (n, 1) column vector, so no reshape is
    # needed before handing the values over.
    for column in measurement_columns:
        values = subset[column].to_numpy()
        find_outliers(remove_nan(values))

In [10]:
# Outlier report restricted to Adelie rows (one printed line per measurement column).
outliers_for_species("Adelie Penguin (Pygoscelis adeliae)", data)

The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[21.5]
The following are the outliers in the boxplot:[172. 210.]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]


In [11]:
# Outlier report restricted to Chinstrap rows (one printed line per measurement column).
outliers_for_species("Chinstrap penguin (Pygoscelis antarctica)", data)

The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[4800. 2700.]
The following are the outliers in the boxplot:[-23.89017 -25.1455  -23.78767]
The following are the outliers in the boxplot:[]


In [12]:
# Outlier report restricted to Gentoo rows (one printed line per measurement column).
outliers_for_species("Gentoo penguin (Pygoscelis papua)", data)

The following are the outliers in the boxplot:[59.6]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]
The following are the outliers in the boxplot:[]


<b>Adelie Penguin:</b>
* 1 odlehlá hodnota pro culmen depth (21.5)
* 2 odlehlé hodnoty pro flipper length (172 a 210)

<b>Chinstrap Penguin:</b>
* 2 odlehlé hodnoty pro body mass (4800 a 2700)
* 3 odlehlé hodnoty pro delta 13 C (-23.89017, -25.1455 a -23.78767)

<b>Gentoo Penguin:</b>
* 1 odlehlá hodnota pro culmen length (59.6)

### Chybějící hodnoty
Úkol: Proveďte podrobnou analýzu chybějících hodnot (celkový počet chybějících hodnot, počet objektů s více chybějícími hodnotami, atd.)

In [13]:
# Re-read the measurements from the table (NaNs included) as (n_rows, 1) column vectors.
cl, cd, fl, bm, dc, dn = (
    data[_column_name].to_numpy().reshape(-1, 1)
    for _column_name in (
        "Culmen Length (mm)",
        "Culmen Depth (mm)",
        "Flipper Length (mm)",
        "Body Mass (g)",
        "Delta 13 C (o/oo)",
        "Delta 15 N (o/oo)",
    )
)

In [14]:
def get_number_of_missing_values(array, name):
    """Print ``name`` with the number of NaN entries in ``array`` and its length."""
    n_rows = len(array)
    n_missing = np.count_nonzero(np.isnan(array))
    print(name, ": ", n_missing, "/", n_rows)

In [15]:
# One "label :  missing / total" line per numeric measurement.
for _values, _label in (
    (cl, "Culmen length"),
    (cd, "Culmen depth"),
    (fl, "Flipper length"),
    (bm, "Body mass"),
    (dc, "Delta 13"),
    (dn, "Delta 15"),
):
    get_number_of_missing_values(_values, _label)

Culmen length :  2 / 344
Culmen depth :  2 / 344
Flipper length :  2 / 344
Body mass :  2 / 344
Delta 13 :  13 / 344
Delta 15 :  14 / 344


In [16]:
# Non-measurement columns, kept as pandas Series.
name = data["studyName"]
number = data["Sample Number"]
species = data["Species"]
island = data["Island"]
stage = data["Stage"]
# NOTE(review): `id` shadows the Python builtin of the same name; a later cell
# refers to this variable, so rename both places together if this is cleaned up.
id = data["Individual ID"]
clutch = data["Clutch Completion"]
date = data["Date Egg"]
sex = data["Sex"]
comments = data["Comments"]

In [17]:
def get_number_of_missing_strings(series, name):
    """Print ``name`` with the number of null entries in ``series`` and its length."""
    n_missing = series.isna().sum()
    n_rows = series.shape[0]
    print(name, ": ", n_missing, "/", n_rows)

In [18]:
# One "label :  missing / total" line per non-measurement column.
get_number_of_missing_strings(name, "Name")
get_number_of_missing_strings(number, "Sample number")
get_number_of_missing_strings(species, "Species")
get_number_of_missing_strings(island, "Island")
get_number_of_missing_strings(stage, "Stage")
# Label spelling corrected (was "Idividual ID").
get_number_of_missing_strings(id, "Individual ID")
get_number_of_missing_strings(clutch, "Clutch completion")
get_number_of_missing_strings(date, "Date egg")
get_number_of_missing_strings(sex, "Sex")
get_number_of_missing_strings(comments, "Comments")

Name :  0 / 344
Sample number :  0 / 344
Species :  0 / 344
Island :  0 / 344
Stage :  0 / 344
Idividual ID :  0 / 344
Clutch completion :  0 / 344
Date egg :  0 / 344
Sex :  10 / 344
Comments :  318 / 344


Celkově je 35 numerických chybějících hodnot.

Dále chybí 10 hodnot u pohlaví tučňáka a 318 komentářů (což není důležité).

Důležitých chybějících hodnot je tedy 35 + 10 = 45.

In [19]:
print(data.shape)

# Rows in which every column, including "Comments", is filled in.
not_empty = data.dropna()
print(not_empty.shape)

# Same check with the "Comments" column left out.
not_empty = data.drop("Comments", axis=1)
not_empty = not_empty.dropna()
print(not_empty.shape)

# Rows with at least one missing value outside "Comments".
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally
# in DataFrame.any, so `any(1)` raises a TypeError there.
empty = data.drop("Comments", axis=1)
empty = empty[empty.isnull().any(axis=1)]
print(empty.shape)

empty

(344, 17)
(13, 17)
(325, 16)
(19, 16)


Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,11/11/07,39.1,18.7,181.0,3750.0,MALE,,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,11/16/07,,,,,,,
8,PAL0708,9,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N5A1,Yes,11/9/07,34.1,18.1,193.0,3475.0,,,
9,PAL0708,10,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N5A2,Yes,11/9/07,42.0,20.2,190.0,4250.0,,9.13362,-25.09368
10,PAL0708,11,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N6A1,Yes,11/9/07,37.8,17.1,186.0,3300.0,,8.63243,-25.21315
11,PAL0708,12,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N6A2,Yes,11/9/07,37.8,17.3,180.0,3700.0,,,
12,PAL0708,13,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N7A1,Yes,11/15/07,41.1,17.6,182.0,3200.0,FEMALE,,
13,PAL0708,14,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N7A2,Yes,11/15/07,38.6,21.2,191.0,3800.0,MALE,,
15,PAL0708,16,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N8A2,Yes,11/16/07,36.6,17.8,185.0,3700.0,FEMALE,,
39,PAL0708,40,Adelie Penguin (Pygoscelis adeliae),Anvers,Dream,"Adult, 1 Egg Stage",N25A2,No,11/13/07,39.8,19.1,184.0,4650.0,MALE,,


Celkově je pouze 13 řádků zcela zaplněných. Na to ale mají vliv komentáře, které jsou pouze u pár tučňáků. Vzhledem k tomu, že nepřítomnost komentáře neovlivňuje vypovídající hodnotu daného řádku, nebudeme chybějící hodnoty v komentářích uvažovat.



Když nebereme v úvahu komentáře, tak je v datové sadě 19 řádků, na kterých chybí alespoň jedna informace (těch zcela vyplněných je tedy 325).

In [20]:
# Number of missing values in each incomplete row ("Comments" is not part of `empty`).
empty.isnull().sum(axis=1)

0      2
3      7
8      3
9      1
10     1
11     3
12     2
13     2
15     2
39     2
41     2
46     2
47     3
212    1
246    1
250    2
286    1
324    1
339    7
dtype: int64

V tabulce výše je uveden počet chybějících hodnot pro jednotlivé řádky. Například řádky 3 a 339 obsahují každý 7 chybějících hodnot.

Poznámka: číslu řádku $n$ odpovídá v tabulce po otevření v Excelu číslo řádku $n+2$

In [24]:
# Per-row count of missing values, then only the rows with more than one gap.
more_empty = empty.isna().sum(axis=1)
more_empty.loc[more_empty.gt(1)]

0      2
3      7
8      3
11     3
12     2
13     2
15     2
39     2
41     2
46     2
47     3
250    2
339    7
dtype: int64