<a href="https://colab.research.google.com/github/Teoroo-CMC/DoE_Course_Material/blob/main/Week_1/Workshop_3/Using_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A few short introductory exercises in NumPy

In [None]:
import numpy as np

# Build a 1-D array holding the integers 1 through 10
data = np.arange(1, 11)

# Keep positions 2..6: the stop value of a slice (7) is exclusive
sliced_data = data[2:7]

print(sliced_data)

In [None]:
# Start from a small 1-D array
data = np.arange(1, 6)

# np.append gives back a new array with the extra values placed at the end
extra_values = [6, 7, 8]
new_data = np.append(data, extra_values)

print(new_data)

In [None]:
# Allocate a 3x3 array without initialising it: the printed numbers are
# whatever happened to be in memory and carry no meaning
empty_array = np.empty(shape=(3, 3))

print(empty_array)

# A small 2x3 integer array to use as a template
data = np.array([[1, 2, 3],
                 [4, 5, 6]])

# zeros_like takes the shape and dtype from its argument and fills with zeros
zeros_array = np.zeros_like(data)

print(zeros_array)

In [None]:
# np.arange(start, stop, step): the odd numbers from 1 up to (but excluding) 10
arr = np.arange(1, 10, step=2)

print(arr)

# np.linspace(start, stop, num): 5 evenly spaced values from 0 to 1,
# with both end points included
arr = np.linspace(0, 1, num=5)

print(arr)

In [None]:
# A boolean array with a mix of True and False entries
arr = np.array([True, False, True, False])

# any: True as soon as at least one element is True
result = arr.any()

print(result)

# all: True only when every element is True
result = arr.all()

print(result)

In [None]:
# Sample data to search through
arr = np.array([1, 2, 3, 4, 5])

# Given only a condition, np.where returns a tuple of index arrays
# (one per dimension) marking where the condition holds
is_large = arr > 3
indices = np.where(is_large)

print(indices)

In [None]:
# The integers 0..11 laid out as 3 rows by 4 columns
arr_2d = np.arange(12).reshape((3, 4))

arr_2d

In [None]:
# some indexing options (arr_2d has 3 rows and 4 columns)

print(arr_2d[1,2])         # a single element: row 1, column 2
print(arr_2d[1,:])         # all of row 1
print(arr_2d[2])           # all of row 2 (a single index selects along the first axis)
print(arr_2d[2:,-2:])      # rows from 2 onwards, last two columns
print(arr_2d[::2,1::2])    # every second row, every second column starting at column 1

In [None]:
# Summing over the different axes

print(np.sum(arr_2d))          # no axis given: sum of all elements
print(np.sum(arr_2d, axis=0))  # sum down the rows: one total per column
print(np.sum(arr_2d, axis=1))  # sum across the columns: one total per row

In [None]:
# concatenating data using hstack and vstack

# hstack joins side by side: the (3, 1) array becomes an extra fifth column
print(np.hstack((arr_2d, np.array([[10],[26],[42]]))))
# vstack joins top to bottom: the length-4 array becomes an extra fourth row
print(np.vstack((arr_2d, np.array([12,14,16,18]))))

# Getting to know the Pandas library; Series and DataFrame

## Getting to know pandas Series!

In [None]:
# Import Series and DataFrame directly into the namespace because they are used so much

from pandas import Series, DataFrame
import pandas as pd
import numpy as np

In [None]:
# Build a Series from a plain list of values; no index is passed here

obj = Series(data=[4, 7, -5, 3])

In [None]:
print(obj.values)
print(obj.index)

In [None]:
# A Series’s index can be altered in place by assignment:

obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)

In [None]:
# A Series with explicit, user-chosen index labels

labels = ['d', 'b', 'a', 'c']
obj2 = Series([4, 7, -5, 3], index=labels)
print(obj2)

In [None]:
obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])

print(obj2.values)
print(obj2.index)

In [None]:
print(obj2.values)
print(obj2.index)

In [None]:
# Obtain the values by index

print(obj2['a'])
print(obj2['d'])

In [None]:
# You can also specify multiple values

print(obj2[['c', 'a', 'd']])

In [None]:
# Slicing by label runs from 'b' to 'c' in index order and, unlike
# positional slicing, includes the end label
print(obj2['b':'c'])

In [None]:
# And apply mathematical operations, note that the indices remain unchanged!

print(obj2[obj2 > 0])
print(obj2 * 2)
print(np.exp(obj2))

In [None]:
# Another way to think about a Series is as a fixed-length, ordered dict, as it is a mapping 
# of index values to data values. It can be substituted into many functions that expect a dict

print('b' in obj2)
print('e' in obj2)

In [None]:
# A dict maps naturally onto a Series: the keys become the index
# labels and the values become the data

sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
obj3 = Series(sdata)

print(obj3)

In [None]:
# In the following case, the 3 values found in sdata are placed in the appropriate locations, but since
# no value for 'California' is found, it appears as NaN (not a number), which pandas uses to mark missing values

states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = Series(sdata, index=states) # Unidentified blocks are filled with NaNs (not a number)

print(obj4)

In [None]:
obj3

In [None]:
# A critical Series feature for many applications is that it automatically aligns differently indexed data in arithmetic operations:

print(obj3 + obj4)

In [None]:
# Both the Series object itself and its index have a name attribute, which integrates with
# other key areas of pandas functionality:

obj5 = obj4.copy()
obj5.name = 'population'
obj5.index.name = 'state'

print(obj4)
print(obj5)

## Getting to know the pandas DataFrame!

In [None]:
# A DataFrame is most commonly built from a dict whose values are
# equal-length lists (or NumPy arrays); each key becomes a column

data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data=data)

frame # as the final expression of the cell this is rendered as a table, which looks nicer than print(frame)

In [None]:
# If you specify a sequence of columns, the DataFrame’s columns will be exactly what you pass:

frame = DataFrame(data, columns=['year', 'state', 'pop'])

frame

In [None]:
# As with Series, if you pass a column that isn’t contained in data, it will appear with NA
# values in the result:

frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five'])

frame2

In [None]:
print(frame2.columns)

In [None]:
# A column in a DataFrame can be retrieved as a Series either by dict-like notation or by attribute:

print(frame2['state'])
print(frame2.year)

# Note that the returned Series have the same index as the DataFrame, and their name
# attribute has been appropriately set.

In [None]:
# .loc and .iloc are two of the most common ways of selecting data from a DataFrame.
# .loc is label-based: rows and columns are addressed by their row and column labels.

row_labels = ['Ohio', 'Colorado', 'Utah', 'New York']
column_labels = ['one', 'two', 'three', 'four']
data = DataFrame(np.arange(16).reshape((4, 4)),
    index=row_labels,
    columns=column_labels)

print(data.loc['Ohio', 'three'])

In [None]:
# iloc is integer position-based, so you have to specify rows and columns by their integer position values (0-based integer position).

print(data.iloc[2, 3])

In [None]:
# Columns can be modified by assignment. For example, the empty 'debt' column could
# be assigned a scalar value or an array of values:

frame2['debt'] = 16.5

frame2

In [None]:
frame2['debt'] = np.arange(5.)

frame2

In [None]:
frame2[frame2['pop']>2]

In [None]:
# When assigning lists or arrays to a column, the value’s length must match the length
# of the DataFrame. If you assign a Series, it will be instead conformed exactly to the
# DataFrame’s index, inserting missing values in any holes:

val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val

frame2

In [None]:
# Assigning a column that doesn’t exist will create a new column.

frame2['eastern'] = frame2.state == 'Ohio'

frame2

In [None]:
# The del keyword will delete columns as with a dict:

del frame2['eastern']

In [None]:
frame2

In [None]:
# Nested dicts (a dict of dicts) are another common input format:
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

# DataFrame treats the outer keys as the column labels and the inner
# keys as the row labels:
frame3 = DataFrame(data=pop)

frame3

In [None]:
# Of course you can always transpose the result:

frame3.T

In [None]:
# The keys in the inner dicts are unioned and sorted to form the index in the result. This
# isn’t true if an explicit index is specified:
DataFrame(pop, index=[2001, 2002, 2003])

In [None]:
# Dicts of Series are treated much in the same way:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}

DataFrame(pdata)

In [None]:
# If a DataFrame’s index and columns have their name attributes set, these will also be displayed:
frame3.index.name = 'year'; frame3.columns.name = 'state'

frame3

In [None]:
# Like Series, the values attribute returns the data contained in the DataFrame as a 2D ndarray:

frame3.values

In [None]:
# If the DataFrame’s columns have different dtypes, the dtype of the values array will be
# chosen to accommodate all of the columns:

frame2.values

## Some subtleties with the Index class

In [None]:
# pandas’s Index objects are responsible for holding the axis labels and other metadata
# (like the axis name or names). Any array or other sequence of labels used when constructing
# a Series or DataFrame is internally converted to an Index:

obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index

# Index objects are immutable, so item assignment raises a TypeError.
# The error is caught and printed so that this demonstration does not
# stop "Run all" from reaching the cells that follow.
try:
    index[-1] = 3
except TypeError as err:
    print('TypeError:', err)

## Reindexing

In [None]:
# reindex is a key method on pandas objects: it creates a new object
# whose data is conformed to the index you ask for.

obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])

# Existing labels keep their values (now in the requested order); labels that
# were not present before ('e' here) show up as missing values:
new_labels = ['a', 'b', 'c', 'd', 'e']
obj2 = obj.reindex(new_labels)

obj2

In [None]:
# For ordered data such as time series it is often useful to fill in the gaps
# while reindexing. The method option of reindex does this.

obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])

print(obj3)

new_index = range(6)

# forward fill: each new label takes the value of the closest label before it
obj4 = obj3.reindex(new_index, method='ffill')

print(obj4)

# backward fill: each new label takes the value of the closest label after it
# (label 5 has nothing after it, so it stays missing)
obj5 = obj3.reindex(new_index, method='bfill')

print(obj5)

In [None]:
# reindex on a DataFrame can change the rows, the columns, or both.
# A bare sequence is interpreted as the new row index.

frame = DataFrame(np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'])

print(frame)

# 'b' is not an existing row label, so that row is filled with missing values
frame2 = frame.reindex(index=['a', 'b', 'c', 'd'])

frame2

In [None]:
# The columns can be reindexed using the columns keyword:

states = ['Texas', 'Utah', 'California']
frame = frame.reindex(columns=states)

print(frame)

In [None]:
# Both can be reindexed in one shot, though interpolation will only apply row-wise (axis 0):

frame = frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',
    columns=states)

In [None]:
frame

In [None]:
# drop returns a new object with the indicated label(s) removed from an axis,
# which saves building the reduced index by hand:

obj = Series(np.arange(5.), index=list('abcde'))

new_obj = obj.drop(labels='c')

new_obj

In [None]:
# On a DataFrame, labels can be dropped from the rows or from the columns:

data = DataFrame(np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'])

data = data.drop(index=['Colorado', 'Ohio'])    # remove two rows
data = data.drop(columns=['two', 'four'])       # remove two columns

data

## Indexing, selection, and filtering

In [None]:
# Series indexing (obj[...]) works analogously to NumPy array indexing, except you can
# use the Series’s index values instead of only integers. Selecting by integer
# *position* is done with .iloc: on a non-integer index, integer keys passed to
# obj[...] are only treated as positions through a fallback that is deprecated
# in pandas 2.x. Here are some examples of this:

obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

print(obj['b'])                 # single label
print(obj.iloc[1])              # single position
print(obj.iloc[2:4])            # positional slice (end point excluded)
print(obj[['b', 'a', 'd']])     # list of labels
print(obj.iloc[[1, 3]])         # list of positions
print(obj[obj < 2])             # boolean mask

In [None]:
# Slicing with labels behaves differently than normal Python slicing in that the endpoint
# is inclusive:

print(obj['b':'c'])

In [None]:
# Setting using these methods works just as you would expect:

obj['b':'c'] = 5

print(obj)

In [None]:
# Plain indexing on a DataFrame retrieves columns: pass a single label
# for one column, or a list of labels for several

data = DataFrame(np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'])

single_column = data['two']
print(single_column)
column_subset = data[['three', 'one']]
print(column_subset)

In [None]:
# Indexing like this has a few special cases. For example selecting rows by slicing or a boolean array:

print(data[:2])
print(data[data['three'] > 5])

## Some final useful commands you might use in working with DataFrames

pandas objects are equipped with a set of common mathematical and statistical methods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like the sum or mean) from a Series or a Series of values from
the rows or columns of a DataFrame.

In [None]:
# A small frame with missing values (NaN) in it, written column by column
df = DataFrame(
    {'one': [1.4, 7.1, np.nan, 0.75],
     'two': [np.nan, -4.5, np.nan, -1.3]},
    index=['a', 'b', 'c', 'd'])

df

In [None]:
# Compared with the equivalent methods of vanilla NumPy arrays, they are all built from the ground up to exclude missing data. 
# Calling DataFrame’s sum method returns a Series containing column sums:

print(df.sum())

# Passing axis=1 sums over the rows instead:

print(df.sum(axis=1))

In [None]:
# NA values are excluded unless the entire slice (row or column in this case) is NA. This
# can be disabled using the skipna option:

print(df.mean(axis=1, skipna=False))

In [None]:
# Some methods, like idxmin and idxmax, return the index labels
# where the minimum or maximum values are attained:

print(df.idxmin())

# Others give the minimum or maximum values

print(df.max())

## More advanced statistical analysis on larger datasets

In [None]:
# Let's import one of seaborn's default datasets which will work nicely with pandas too

import seaborn as sns

iris = sns.load_dataset("iris")

iris

In [None]:
# The corr method of Series computes the correlation of the overlapping, non-NA,
# aligned-by-index values in two Series. Relatedly, cov computes the covariance:

# Boolean filtering keeps only the rows of a single species
iris_setosa = iris[iris['species'] == 'setosa']

# Only the last expression of a cell is displayed automatically, so a
# mid-cell preview has to be printed explicitly
print(iris_setosa.head())

# Correlation and covariance of two columns, computed over the full dataset
print(iris.sepal_length.corr(iris.petal_length))
print(iris.sepal_length.cov(iris.petal_length))

In [None]:
# DataFrame’s corr and cov methods, on the other hand, return a full correlation or
# covariance matrix as a DataFrame.

# iris also contains the string column 'species'. Calling iris.corr() directly
# raises an error for that column in pandas 2.0 and later, so restrict the
# frame to its numeric columns first:

print(iris.select_dtypes(include='number').corr())

In [None]:
iris_floats = iris[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]

In [None]:
print(iris_floats.mean()) # obtain the mean column-wise
print(iris_floats.std()) # and the standard deviation

In [None]:
# dir() is very handy: it lists all the attributes and methods associated with an object

test_var = 3.0
available_names = dir(test_var)
print(available_names)

# Ignore the names starting with one or two underscores for now (they are typically hidden),
# but note that a float has, amongst others, the two attributes real and imag.
# The overall list is still quite small.

In [None]:
# If you're brave, try to have a look what attributes and methods a typical dataframe has available, you will be surprised! 
print(dir(iris))

# Extra exercises to get familiar with handling larger datasets

Feel free to load your own datasets, for example using pd.read_csv if your data is in csv (comma-separated values) format.
Otherwise, feel free to load any of the default seaborn datasets (https://github.com/mwaskom/seaborn-data) to play around with:

In [None]:
import seaborn as sns

# planets = sns.load_dataset("planets")
penguins = sns.load_dataset("penguins")
# fmri = sns.load_dataset("fmri")

# Or any of the other sets available at: https://github.com/mwaskom/seaborn-data

penguins

* Is there on average a weight difference between penguins from different islands? If so, is it significant?
* Is there a correlation between the bill length and bill depth? And with the flipper length?
* Bonus points if you can make a few cool plots using some of these datasets!