In [35]:
# ----- IMPORTS -----
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
pd.options.mode.chained_assignment = None  # default='warn'

In [36]:
# downloading and opening data from palmar penguin dataset
penguinData = pd.read_csv("penguins_size.csv")
# With head() I can see a quick glance at the data
penguinData.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [37]:
# With describe() I can view some of the numerical values and some statistical values
penguinData.describe()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
count,342.0,342.0,342.0,342.0
mean,43.92193,17.15117,200.915205,4201.754386
std,5.459584,1.974793,14.061714,801.954536
min,32.1,13.1,172.0,2700.0
25%,39.225,15.6,190.0,3550.0
50%,44.45,17.3,197.0,4050.0
75%,48.5,18.7,213.0,4750.0
max,59.6,21.5,231.0,6300.0


In [38]:
# We can view the datatypes and column names clearly
penguinData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [46]:
# creating a split object using StratifiedShuffledSplit()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) 

In [47]:
# test train split gets error because of NaN values
PD = penguinData.dropna()

In [48]:
PD.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [55]:
PD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            334 non-null    object  
 1   island             334 non-null    object  
 2   culmen_length_mm   334 non-null    float64 
 3   culmen_depth_mm    334 non-null    float64 
 4   flipper_length_mm  334 non-null    float64 
 5   body_mass_g        334 non-null    float64 
 6   sex                334 non-null    object  
 7   body_mass_grams    333 non-null    category
dtypes: category(1), float64(4), object(3)
memory usage: 21.4+ KB


In [60]:
PD[["body_mass_grams"]] = PD[["body_mass_grams"]].apply(pd.to_numeric)
PD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            334 non-null    object 
 1   island             334 non-null    object 
 2   culmen_length_mm   334 non-null    float64
 3   culmen_depth_mm    334 non-null    float64
 4   flipper_length_mm  334 non-null    float64
 5   body_mass_g        334 non-null    float64
 6   sex                334 non-null    object 
 7   body_mass_grams    333 non-null    float64
dtypes: float64(5), object(3)
memory usage: 23.5+ KB


In [66]:
PD=PD.drop(columns=['body_mass_g'])

In [67]:
PD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 334 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   species            334 non-null    object  
 1   island             334 non-null    object  
 2   culmen_length_mm   334 non-null    float64 
 3   culmen_depth_mm    334 non-null    float64 
 4   flipper_length_mm  334 non-null    float64 
 5   sex                334 non-null    object  
 6   body_mass_grams    333 non-null    category
dtypes: category(1), float64(3), object(3)
memory usage: 26.9+ KB


In [69]:
# I know standard deviation is 801 and min is 2700 and max is 6300
PD["body_mass_grams"] = pd.cut(PD["body_mass_grams"],
                              bins = [2700.0, 3500.0, 4300.0, 5100.0, 5900.0, 6300.0],
                              labels = [1, 2, 3, 4, 5])

In [71]:
for train_index, test_index in split.split(PD, PD["body_mass_grams"]):
    strat_train_set = PD.loc[train_index]
    strat_test_set = PD.loc[test_index]

ValueError: Input y contains NaN.