# A few short introductory exercises in NumPy

In [None]:
import numpy as np

# Build a 1-D integer array holding the values 1 through 10
data = np.arange(1, 11)

# Slices are half-open: 2:7 selects positions 2, 3, 4, 5 and 6 (position 7 is excluded)
sliced_data = data[2:7]

print(sliced_data)

In [None]:
# Create a 1-D NumPy array
data = np.array([1, 2, 3, 4, 5])

# np.append returns an array with the extra values added at the end;
# the result is stored under a new name (new_data)
new_data = np.append(data, [6, 7, 8])

print(new_data)

In [None]:
# Allocate an array with shape (3, 3) using np.empty.
# NOTE: np.empty does not initialise its entries, so the printed numbers are
# arbitrary and may differ from run to run.
empty_array = np.empty((3, 3))

print(empty_array)

# Create a 2 x 3 NumPy array (this rebinds the name `data`)
data = np.array([[1, 2, 3], [4, 5, 6]])

# Initialize an array of zeros with the same shape as 'data'
zeros_array = np.zeros_like(data)

print(zeros_array)

In [None]:
# np.arange: evenly spaced values defined by a step.
# Starts at 1, stops before 10 (the end is exclusive), advancing by 2.
arr = np.arange(1, 10, step=2)

print(arr)

# np.linspace: evenly spaced values defined by a count.
# 5 values from 0 to 1, with the end point included.
arr = np.linspace(0, 1, num=5)

print(arr)

In [None]:
# A boolean array with a mix of True and False entries
arr = np.array([True, False, True, False])

# any(): is at least one element True?
result = arr.any()

print(result)

# all(): is every element True?
result = arr.all()

print(result)

In [None]:
# Create a NumPy array
arr = np.array([1, 2, 3, 4, 5])

# Find the indices where the elements are greater than 3.
# Called with only a condition, np.where returns a tuple of index arrays
# (one array per dimension of `arr`).
indices = np.where(arr > 3)

print(indices)

In [137]:
# The integers 0..11 laid out row by row as 3 rows x 4 columns
arr_2d = np.arange(12).reshape((3, 4))

arr_2d

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [139]:
# Some indexing options on the 3 x 4 array arr_2d

print(arr_2d[1,2])         # single element: row 1, column 2
print(arr_2d[1,:])         # all of row 1
print(arr_2d[2])           # a single index selects a whole row
print(arr_2d[2:,-2:])      # rows from 2 onward, last two columns (result stays 2-D)
print(arr_2d[::2,1::2])    # every second row; every second column starting at column 1

6
[4 5 6 7]
[ 8  9 10 11]
[[10 11]]
[[ 1  3]
 [ 9 11]]


In [140]:
# Summing over the different axes of arr_2d

print(arr_2d.sum())        # no axis given: total of all elements
print(arr_2d.sum(axis=0))  # axis=0 collapses the rows: one total per column
print(arr_2d.sum(axis=1))  # axis=1 collapses the columns: one total per row

66
[12 15 18 21]
[ 6 22 38]


In [149]:
# Concatenating data using hstack and vstack

# hstack adds columns, so the new block needs one row per existing row: shape (3, 1)
print(np.hstack([arr_2d, np.array([[10], [26], [42]])]))
# vstack adds rows, so the new row needs one entry per existing column: 4 values
print(np.vstack([arr_2d, np.array([12, 14, 16, 18])]))

[[ 0  1  2  3 10]
 [ 4  5  6  7 26]
 [ 8  9 10 11 42]]
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 14 16 18]]


# Getting to know the Pandas library; Series and DataFrame

## Getting to know pandas Series!

In [1]:
# Import Series and DataFrame by name as well, because they are used so much

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain list of values; no index is given here

obj = Series(data=[4, 7, -5, 3])

In [3]:
print(obj.values)  # the underlying data as an array
print(obj.index)   # the index; a default RangeIndex, since none was given

[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)


In [14]:
# A Series's index can be altered in place by assignment
# (one new label per existing value, four here):

obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


In [4]:
# This time supply the index labels explicitly; labels and values pair up by position

obj2 = Series(
    data=[4, 7, -5, 3],
    index=['d', 'b', 'a', 'c'],
)
print(obj2)

d    4
b    7
a   -5
c    3
dtype: int64


In [125]:
# (Re)create obj2 so this cell can run on its own, then inspect its parts
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

print(obj2.values)  # the data as an array
print(obj2.index)   # the explicit string labels

[ 4  7 -5  3]
Index(['d', 'b', 'a', 'c'], dtype='object')


In [5]:
# Raw values and index labels of obj2
print(obj2.values)
print(obj2.index)

[ 4  7 -5  3]
Index(['d', 'b', 'a', 'c'], dtype='object')


In [6]:
# Obtain single values by their index label

print(obj2['a'])
print(obj2['d'])

-5
4


In [7]:
# You can also pass a list of labels; the result is a Series in the order requested

print(obj2[['c', 'a', 'd']])

c    3
a   -5
d    4
dtype: int64


In [128]:
# Slicing by label: the end label 'c' is included, and the slice follows the
# index order (d, b, a, c) rather than alphabetical order
print(obj2['b':'c'])

b    7
a   -5
c    3
dtype: int64


In [8]:
# Filtering and mathematical operations keep each value attached to its index label

print(obj2[obj2 > 0])   # boolean mask: keep only the positive values
print(obj2 * 2)         # element-wise scalar multiplication
print(np.exp(obj2))     # NumPy functions apply element-wise and return a Series

d    4
b    7
c    3
dtype: int64
d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [9]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping
# of index values to data values. It can be substituted into many functions that expect a dict.
# The `in` operator tests membership among the index labels:

print('b' in obj2)
print('e' in obj2)

True
False


In [111]:
# A Python dict converts directly into a Series:
# the keys become the index labels and the values become the data.

sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = Series(data=sdata)

print(obj3)

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [113]:
# When an explicit index is passed along with a dict, only the listed labels are kept.
# Here 3 values found in sdata are placed in the appropriate locations; no value for
# 'California' is found, so it appears as NaN (not a number), which pandas uses to mark
# missing values. 'Utah' is not in `states`, so it is left out of the result.

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states) # labels missing from sdata are filled with NaN

print(obj4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


In [110]:
# Display obj3 (the last expression of a cell is shown as its output)
obj3

0      blue
2    purple
4    yellow
dtype: object

In [114]:
# A critical Series feature for many applications is that it automatically aligns
# differently indexed data in arithmetic operations. Values are added label by label;
# a label that is missing (or NaN) on either side gives NaN in the result:

print(obj3 + obj4)

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64


In [13]:
# Both the Series object itself and its index have a name attribute, which integrates with
# other key areas of pandas functionality:

obj5 = obj4.copy()          # set the names on a copy rather than on obj4 itself
obj5.name = 'population'    # name of the Series
obj5.index.name = 'state'   # name of the index

print(obj4)
print(obj5)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64


state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## Getting to know the pandas DataFrame!

In [118]:
# A DataFrame can be built in many ways; one of the most common is a dict of
# equal-length lists or NumPy arrays, where every key becomes a column.

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data=data)

# Ending the cell with the bare name gives the rich table display, which looks nicer
# than print(frame) when it is the final / only output of the cell.
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [22]:
# If you specify a sequence of columns, the DataFrame's columns will be exactly what
# you pass, in that order:

frame = DataFrame(data, columns=['year', 'state', 'pop'])

frame

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9


In [119]:
# As with Series, if you pass a column that isn't contained in data ('debt' here), it
# will appear with NA values in the result. The index argument supplies the row labels:

frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'])

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [26]:
# The column labels are themselves stored as an Index object
print(frame2.columns)

Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [122]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:

print(frame2['state'])  # dict-like notation
print(frame2.year)      # attribute notation

# Note that the returned Series have the same index as the DataFrame, and their name
# attribute has been appropriately set.

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64
Ohio
1.7


In [121]:
# Two of the more common ways to select data from a DataFrame are .loc and .iloc.
# .loc is label-based: rows and columns are specified by their row and column labels.

data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

print(data.loc['Ohio', 'three'])

2
11


In [None]:
# iloc is integer position-based, so you have to specify rows and columns by their
# 0-based integer positions: here row position 2 and column position 3.

print(data.iloc[2, 3])

In [29]:
# Columns can be modified by assignment. For example, the empty 'debt' column could
# be assigned a scalar value or an array of values. A scalar is repeated for every row:

frame2['debt'] = 16.5

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5


In [130]:
# Assigning an array fills the column row by row; np.arange(5.) supplies one float
# per row of frame2 (0.0 through 4.0)
frame2['debt'] = np.arange(5.)

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [133]:
# Boolean-mask row selection: keep only the rows whose 'pop' value is greater than 2
frame2[frame2['pop']>2]

Unnamed: 0,year,state,pop,debt
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0


In [31]:
# When assigning lists or arrays to a column, the value's length must match the length
# of the DataFrame. If you assign a Series, it will instead be conformed exactly to the
# DataFrame's index, inserting missing values in any holes:

val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val  # rows 'one' and 'three' have no entry in val and become NaN

frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7


In [33]:
# Assigning to a column name that does not exist yet creates that column.
# The comparison yields one boolean per row: True where the state is 'Ohio'.

frame2['eastern'] = frame2['state'] == 'Ohio'

frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False


In [34]:
# The del keyword will delete columns as with a dict.
# Note: this modifies frame2 in place, so running the cell a second time fails
# because the 'eastern' column no longer exists.

del frame2['eastern']

In [129]:
# Display the current state of frame2
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [41]:
# Another common form of data is a nested dict of dicts.
# Passed to DataFrame, the outer keys become the columns and the inner keys become
# the row index; combinations that are missing from the input are filled with NaN.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

frame3 = DataFrame(data=pop)

frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [42]:
# Of course you can always transpose the result (.T swaps rows and columns):

frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [43]:
# The keys in the inner dicts are unioned to form the index in the result. (The frame3
# output above lists them as 2001, 2002, 2000, i.e. in first-seen order rather than sorted.
# NOTE(review): older pandas versions sorted them - confirm for your version.)
# If an explicit index is specified, exactly those labels are used instead:
DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [44]:
# Dicts of Series are treated much in the same way: each Series becomes a column
# and the Series are aligned on their index labels.
pdata = {'Ohio': frame3['Ohio'][:-1],      # all but the last row of the 'Ohio' column
         'Nevada': frame3['Nevada'][:2]}   # first two rows of the 'Nevada' column

DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [46]:
# When the index and the columns of a DataFrame carry a name, the names are shown
# in the displayed table as well:
frame3.index.name = 'year'
frame3.columns.name = 'state'

frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [47]:
# Like Series, the values attribute returns the data contained in the DataFrame as a 2D ndarray:

frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [48]:
# If the DataFrame's columns are different dtypes, the dtype of the values array will be
# chosen to accommodate all of the columns (here a generic object array):

frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7]], dtype=object)

## Some subtleties with the Index class

In [49]:
# pandas's Index objects are responsible for holding the axis labels and other metadata
# (like the axis name or names). Any array or other sequence of labels used when constructing
# a Series or DataFrame is internally converted to an Index:

obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index

# Index objects are immutable and thus can't be modified by the user. The TypeError is
# the point of this cell, so it is caught and printed: the message is still shown, but
# "Restart Kernel -> Run All" no longer stops here.
try:
    index[-1] = 3
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: Index does not support mutable operations

## Reindexing

In [50]:
# reindex is a key method on pandas objects: it builds a *new* object whose data is
# conformed to the index you ask for. Starting from a simple Series:

obj = Series(
    [4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)

# The values are rearranged to follow the new labels; a label that was not present
# before ('e') gets a missing value.
obj2 = obj.reindex(index=['a', 'b', 'c', 'd', 'e'])

obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [56]:
# For ordered data such as time series you often want the gaps created by reindexing
# to be filled in. The `method` option does this.

obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

print(obj3)

# Forward fill ('ffill'): each new label takes the last known value before it
obj4 = obj3.reindex(index=range(6), method='ffill')

print(obj4)

# Backward fill ('bfill'): each new label takes the next known value after it;
# label 5 has nothing after it and stays NaN
obj5 = obj3.reindex(index=range(6), method='bfill')

print(obj5)

0      blue
2    purple
4    yellow
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object


In [59]:
# On a DataFrame, reindex can change the (row) index, the columns, or both.
# Given just a sequence of labels, it reindexes the rows.

frame = DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)

print(frame)

# 'b' does not exist in the original index, so its row is filled with NaN
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])

frame2

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8


Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [61]:
# The columns can be reindexed using the columns keyword. 'Utah' is not an existing
# column, so it is added as all-NaN; 'Ohio' is not requested, so it is dropped.
# Note: this rebinds `frame` to the reindexed result.

states = ['Texas', 'Utah', 'California']
frame = frame.reindex(columns=states)

print(frame)

   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8


In [62]:
# Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):
# the new row 'b' is forward-filled from row 'a'.

frame = frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
    columns=states)

In [63]:
# Display the reindexed frame
frame

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [64]:
# Removing entries from an axis by hand means building an index without them, which
# takes some munging and set logic. The drop method does it for you and returns a
# new object with the requested label(s) removed; the original is left untouched.

obj = Series(np.arange(5, dtype=float), index=['a', 'b', 'c', 'd', 'e'])

new_obj = obj.drop(labels='c')

new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [67]:
# On a DataFrame, labels can be dropped from either axis: rows or columns.

data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

# First remove two rows by label, then two columns by label
data = (
    data
    .drop(index=['Colorado', 'Ohio'])
    .drop(columns=['two', 'four'])
)

data

Unnamed: 0,one,three
Utah,8,10
New York,12,14


## Indexing, selection, and filtering

In [70]:
# Series indexing (obj[...]) works analogously to NumPy array indexing, except you can
# use the Series's index labels instead of only integers. Because this index holds
# strings, integer *positions* are written with .iloc: plain obj[1] / obj[[1, 3]] on a
# label-indexed Series is deprecated in pandas 2.x. Here are some examples of this:

obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

print(obj['b'])              # single label
print(obj.iloc[1])           # single position
print(obj.iloc[2:4])         # positional slice (end position excluded)
print(obj[['b', 'a', 'd']])  # list of labels
print(obj.iloc[[1, 3]])      # list of positions
print(obj[obj < 2])          # boolean mask

1.0
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64


In [71]:
# Slicing with labels behaves differently than normal Python slicing in that the endpoint
# is inclusive: 'b':'c' returns both 'b' and 'c'.

print(obj['b':'c'])

b    1.0
c    2.0
dtype: float64


In [72]:
# Setting using these methods works just as you would expect:
# every element in the (inclusive) label slice is overwritten, and obj is modified in place.

obj['b':'c'] = 5

print(obj)

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64


In [None]:
# As seen earlier, indexing a DataFrame with [] retrieves columns:
# a single label gives one column, a list of labels gives several.

data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

print(data['two'])             # one column, returned as a Series
print(data[['three', 'one']])  # several columns, in the order requested

In [73]:
# Indexing like this has a few special cases. For example selecting rows by slicing or a boolean array:

print(data[:2])                  # a slice selects rows by position (the first two)
print(data[data['three'] > 5])   # a boolean mask keeps the rows where it is True

          one  three
Utah        8     10
New York   12     14
          one  three
Utah        8     10
New York   12     14


## Some final useful commands you might use in working with DataFrames

pandas objects are equipped with a set of common mathematical and statistical methods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like the sum or mean) from a Series or a Series of values from
the rows or columns of a DataFrame.

In [79]:
# A small frame with some missing values (NaN) to try the summary statistics on
df = DataFrame(
    data=[
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)

df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [75]:
# Compared with the equivalent methods of vanilla NumPy arrays, these are all built from
# the ground up to exclude missing data.
# Calling DataFrame's sum method returns a Series containing column sums:

print(df.sum())

# Passing axis=1 sums over the rows instead:

print(df.sum(axis=1))

one    9.25
two   -5.80
dtype: float64


a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [76]:
# NA values are excluded unless the entire slice (row or column in this case) is NA. This
# can be disabled using the skipna option; with skipna=False a single NaN in a row makes
# that row's result NaN:

print(df.mean(axis=1, skipna=False))

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


In [80]:
# Some methods, like idxmin and idxmax, return the index label
# at which the minimum or maximum value is attained (one label per column here):

print(df.idxmin())

# Others give the minimum or maximum values themselves

print(df.max())

one    d
two    b
dtype: object
one    7.1
two   -1.3
dtype: float64


## More advanced statistical analysis on larger datasets

In [84]:
# Let's import one of seaborn's default datasets which will work nicely with pandas too.
# NOTE(review): sns.load_dataset presumably fetches the data online unless it is cached - confirm.

import seaborn as sns

iris = sns.load_dataset("iris")

iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [88]:
# The corr method of Series computes the correlation of the overlapping, non-NA,
# aligned-by-index values in two Series. Relatedly, cov computes the covariance:

# Boolean-mask selection of the setosa rows. NOTE: the statistics printed below are
# computed on the full `iris` frame (all species), not on this subset.
iris_setosa = iris[iris['species'] == 'setosa']

print(iris.sepal_length.corr(iris.petal_length))
print(iris.sepal_length.cov(iris.petal_length))

0.8717537758865831
1.2743154362416111


In [101]:
# DataFrame's corr and cov methods, on the other hand, return a full correlation or
# covariance matrix as a DataFrame.
#
# The recorded run of this cell raised a ValueError on the text column 'species'. That
# failure is what the cell demonstrates, so it is caught and printed: the message is
# still shown, but "Restart Kernel -> Run All" no longer stops here.
try:
    print(iris.corr())
except ValueError as err:
    print(f'ValueError: {err}')

# This doesn't work, we have to filter the strings

ValueError: could not convert string to float: 'setosa'

In [102]:
# Keep only the four numeric measurement columns (drops the text column 'species')
iris_floats = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

In [103]:
# Summary statistics of the numeric columns; each call returns one value per column
print(iris_floats.mean()) # obtain the mean column-wise
print(iris_floats.std()) # and the standard deviation

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64
sepal_length    0.828066
sepal_width     0.435866
petal_length    1.765298
petal_width     0.762238
dtype: float64


In [135]:
# A very useful command is dir, which gives you an overview of all the attributes and
# methods associated with an object.

test_var = 3.0
print(dir(test_var))

# Ignore the attributes and methods starting with one or two underscores for now; they
# are typically hidden. Note that a float has, amongst others, the two attributes real
# and imag. The overall list is still quite small.

['__abs__', '__add__', '__bool__', '__ceil__', '__class__', '__delattr__', '__dir__', '__divmod__', '__doc__', '__eq__', '__float__', '__floor__', '__floordiv__', '__format__', '__ge__', '__getattribute__', '__getformat__', '__getnewargs__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__int__', '__le__', '__lt__', '__mod__', '__mul__', '__ne__', '__neg__', '__new__', '__pos__', '__pow__', '__radd__', '__rdivmod__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__setattr__', '__setformat__', '__sizeof__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '__trunc__', 'as_integer_ratio', 'conjugate', 'fromhex', 'hex', 'imag', 'is_integer', 'real']


In [None]:
# If you're brave, have a look at which attributes and methods a typical DataFrame has
# available. You will be surprised!
print(dir(iris))

# Extra exercises to get familiar with handling larger datasets

Feel free to load your own datasets, for example using pd.read_csv if your data is in csv (comma-separated values) format.
Otherwise, feel free to load any of the default seaborn datasets (https://github.com/mwaskom/seaborn-data) to play around with:

In [105]:
import seaborn as sns

# Pick one dataset to work with; uncomment one of the alternatives to try another.
# planets = sns.load_dataset("planets")
penguins = sns.load_dataset("penguins")
# fmri = sns.load_dataset("fmri")

# Or any of the other sets available at https://github.com/mwaskom/seaborn-data

penguins

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


* Is there on average a weight difference between penguins from different islands? If so, is it significant?
* Is there a correlation between the bill length and bill depth? And with the flipper length?
* Bonus points if you can make a few cool plots using some of these datasets!