In [None]:
# work with 'olive.csv' in 'data/' directory

### Q1: *Import* numpy and pandas

In [None]:
# import modules
import sys
from pathlib import Path

import numpy as np
import pandas as pd
# Make IPython display the value of every top-level expression in a cell,
# not only the last one (IPython's default). Several cells below rely on
# this to show more than one result from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Q2: *Set options* in pandas 
>* display 30 `max_rows` 
>* no limit on `max_columns`
>* precision is 3

In [None]:
# configure pandas display through the attribute-style options API
pd.options.display.max_rows = 30        # show at most 30 rows
pd.options.display.max_columns = None   # no limit on displayed columns
pd.options.display.precision = 3        # 3 decimal places for floats

### Q3: *Get the versions* of pandas and numpy
> **DOUBLE** underscore!! '__'

In [None]:
# display the installed numpy and pandas versions
# (`__version__` has two underscores on each side); both values are shown
# because ast_node_interactivity was set to "all" in the imports cell
np.__version__
pd.__version__

### Q4: *Read in* the 'olive.csv' file, assign it to variable `df`

In [None]:
# pd.read_csv resolves a relative path against the current working directory,
# not against sys.path (which only lists module import locations), so show
# the working directory and whether 'olive.csv' is reachable from it
Path.cwd()
Path('olive.csv').exists()

In [None]:
# read the data from the csv file into a DataFrame
# then preview the first rows to get an initial feel for what the data looks like
df = pd.read_csv('olive.csv')
df.head()

### Q5: How many *rows & columns* are there in `df`?

In [None]:
# (number of rows, number of columns)
df.shape
# per-column summary: names, non-null counts, dtypes
df.info()
# number of distinct values in the `region` and `area` columns
df['region'].nunique(dropna=False)
df['area'].nunique(dropna=False)
# same summary, restricted to the columns from position 3 onward
df.iloc[:, 3:].info()

### Q6: Get the *first 2 rows* of `df`

In [None]:
# first two rows, all columns
df.head(2)

### Q7: *How many distinct* dtypes are there in `df`?

In [None]:
# dtype of every column
df.dtypes
# how many different dtypes occur among the columns
df.dtypes.nunique(dropna=False)

### Q8: In `df`, add a *copy* of the `Unnamed: 0` *column*
> as 'sub_region_raw' column

In [None]:
# keep an untouched copy of the 'Unnamed: 0' values under a new column name
df['sub_region_raw'] = df['Unnamed: 0'].copy()
df

### Q9: In `df`, *rename columns*
> Note: use the `inplace=True` flag (or assign the result back to `df`), otherwise the rename is not kept❗

In [None]:
# rename the 'Unnamed: 0' column; inplace=True modifies df itself
df.rename({'Unnamed: 0': 'sub_region_desc'}, axis='columns', inplace=True)
df

In [None]:
# rename the 'area' column; inplace=True modifies df itself
df.rename({'area': 'sub_region'}, axis='columns', inplace=True)
df

### Q10: Get the *unique values* of `region` *column*

In [None]:
# distinct values of the `region` column, in order of first appearance
df['region'].unique()

### Q11: In the `sub_region` *column*, *how many unique* values are there?

In [None]:
# number of distinct values in `sub_region` (a missing value, if any, counts as one)
df['sub_region'].nunique(dropna=False)

### Q13: Let's take a look at the `sub_region_desc` column:

In [None]:
# preview the first rows to inspect the values in `sub_region_desc`
df.head()

It looks like `sub_region_desc` has line numbers attached to the beginning of each region name.  
>* [Working with Text Data](https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#text-string-methods)  
>* Remove those leading line numbers and  
>* Get the **unique values in the column**

In [None]:
# strip the leading line number and its '.' separator (e.g. '12.Name' -> 'Name');
# the pattern is anchored at the start of the string, so values that are
# already clean are left unchanged and the cell is safe to re-run
df['sub_region_desc'] = (df['sub_region_desc']
                         .str.replace(r'^\d+\.', '', regex=True)
                        )
# .unique() returns one entry per distinct value, so its result cannot be
# assigned back to the column (length mismatch); display it separately
df['sub_region_desc'].unique()