# Agenda

1. Data frames (2D data)
2. Reading (and writing) files -- real-world data!

To download: https://files.lerner.co.il/data-science-exercise-files.zip

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

When we import a module, we're basically asking Python to do the following:

1. Find the module (a file ending with ".py") on disk
2. Load it into memory
3. Cache it, so that we don't need to load it a second time
4. Define the module as a variable in our global namespace

The second time we use import, we just jump directly to step 4.



In [4]:
import sys
sys.modules['pandas']  # sys.modules is Python's cache of already-imported modules, keyed by module name

<module 'pandas' from '/usr/local/lib/python3.11/site-packages/pandas/__init__.py'>

What about "from import"?

In that case, it takes a slightly different route:

1. Find the module (a file ending with ".py") on disk
2. Load it into memory
3. Cache it, so that we don't need to load it a second time
4. Define only the names we've specified as variables in our global namespace (the module itself is not defined as a variable)


In [5]:
from random import randint

In [6]:
# even though we only asked for randint, the whole random module was loaded and cached
sys.modules['random']

<module 'random' from '/usr/local/Cellar/python@3.11/3.11.3/Frameworks/Python.framework/Versions/3.11/lib/python3.11/random.py'>

In [7]:
# One way to create a data frame: pass a list of lists,
# where each inner list supplies the values for one row.
df = DataFrame(
    data=[
        [10, 20, 30, 40],
        [50, 60, 70, 80],
        [90, 100, 110, 120],
    ]
)
df

Unnamed: 0,0,1,2,3
0,10,20,30,40
1,50,60,70,80
2,90,100,110,120


As with a series, we have an index -- along the left column, describing our rows

We also have columns, which are along the top row, describing the columns.

By default, both are numbered starting at 0.

We can set one or both by passing "index=" or "columns=" when we create the data frame. And yes, we can modify those down the road.

In [8]:
# A list of lists again, this time with explicit row labels (index=)
# and column labels (columns=) instead of the default 0-based numbering.
df = DataFrame(
    data=[
        [10, 20, 30, 40],
        [50, 60, 70, 80],
        [90, 100, 110, 120],
    ],
    index=['a', 'b', 'c'],
    columns=['w', 'x', 'y', 'z'],
)
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80
c,90,100,110,120


In [9]:
# retrieving a row -- we use .loc (by index label) and .iloc (by integer position)
# a single row comes back as a series whose index is the frame's column names

df.loc['a']

w    10
x    20
y    30
z    40
Name: a, dtype: int64

In [10]:
df.loc[['a', 'c']]   # fancy indexing -- pass a list of labels to request more than one row; the result is a data frame

Unnamed: 0,w,x,y,z
a,10,20,30,40
c,90,100,110,120


In [11]:
# I can also use .iloc, which selects by position rather than by label
# (position 1 is the second row, which here is row b)

df.iloc[1]

w    50
x    60
y    70
z    80
Name: b, dtype: int64

In [12]:
# how can I retrieve one or more columns? Use []
# a single column name gives back that column as a series
df['w']

a    10
b    50
c    90
Name: w, dtype: int64

In [13]:
# can I get more than one column? Yes! Pass a list of column names inside the []
df[['w', 'y']]

Unnamed: 0,w,y
a,10,30
b,50,70
c,90,110


In [14]:
# what about numbering our columns?  that doesn't really happen.

# what about slices of rows?
# with .loc, a label slice includes the end label: 'a':'c' returns rows a, b, and c
df.loc['a':'c']

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80
c,90,100,110,120


In [15]:
# a slice can also take a step: every second row from a through c, i.e. rows a and c
df.loc['a':'c':2]

Unnamed: 0,w,x,y,z
a,10,20,30,40
c,90,100,110,120


In [17]:
# you don't need .loc when asking for a slice!
# yes, this is hugely inconsistent:
# [] with a name or a list of names selects columns, but [] with a slice selects rows

df['a':'c':2]

Unnamed: 0,w,x,y,z
a,10,20,30,40
c,90,100,110,120


In [18]:
# display the full data frame again, as a reference for the selections that follow
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80
c,90,100,110,120


Let's say that I want a particular row and a particular column. I'll use .loc for this, as well -- I'll just use its two-argument version:

    df.loc[ROW_SELECTOR, COLUMN_SELECTOR]
    
The row selector (and the column selector, for that matter) can be:

1. An individual row name
2. A list of row names (indexes)
3. A boolean series/list
4. A slice of row names (e.g., 'a':'c')

This is the standard way to work with data frames.

In [19]:
# if I want row b, column y
# (one row label plus one column label gives back a single value)

df.loc['b', 'y']

70

In [20]:
# if I want row b, columns y and z
# (one row label plus a list of columns gives back a series)

df.loc['b', ['y', 'z']]

y    70
z    80
Name: b, dtype: int64

In [21]:
# if I want rows a and c, columns y and z
# (lists for both selectors give back a smaller data frame)

df.loc[['a', 'c'], ['y', 'z']]

Unnamed: 0,y,z
a,30,40
c,110,120


In [22]:
# you can split the two selectors across separate lines inside the brackets,
# which leaves room to comment each one

df.loc[
    ['a', 'c'],   # row selector
    ['y', 'z']    # column selector
]

Unnamed: 0,y,z
a,30,40
c,110,120


In [26]:
# I can use a boolean index here:
# the comparison produces a boolean series (one True/False per row),
# and .loc keeps only the rows where it is True

df.loc[
    df['x'] > df['x'].mean(),   # row selector
    ['y', 'z']                  # column selector
]

Unnamed: 0,y,z
c,110,120


# Exercises with data frames

1. Create a 5x5 data frame with rows abcde and columns vwxyz. The values should be random integers from 0-1,000. (You can use a 2D NumPy array for this, if you want.)

2. Retrieve row b
3. Retrieve rows b and d
4. Retrieve rows b, c, and d
5. Retrieve column w
6. Retrieve columns w and y
7. Retrieve columns w, x, and y
8. Retrieve the item at row e, column v


In [27]:
np.random.seed(0)  # seed NumPy's global random number generator, so that the sequence of random values can be repeated

In [29]:
# ask for 25 random integers from 0 up to (but not including) 100, then reshape them into a 5x5 array
np.random.randint(0, 100, 25).reshape(5, 5)

array([[ 9, 20, 80, 69, 79],
       [47, 64, 82, 99, 88],
       [49, 29, 19, 19, 14],
       [39, 32, 65,  9, 57],
       [32, 31, 74, 23, 35]])

In [30]:
# alternatively, pass the shape as the third argument and get a 5x5 array back directly
np.random.randint(0, 100, [5,5])

array([[75, 55, 28, 34,  0],
       [ 0, 36, 53,  5, 38],
       [17, 79,  4, 42, 58],
       [31,  1, 65, 41, 57],
       [35, 11, 46, 82, 91]])

In [34]:
# 1. Create a 5x5 data frame with rows abcde and columns vwxyz. The values should be random integers from 0-1,000. (You can use a 2D NumPy array for this, if you want.)

# Seed first, so that the same "random" values come out each time this cell runs.
np.random.seed(0)
# Note: randint's upper bound is exclusive, so these values range from 0 through 999.
df = DataFrame(
    data=np.random.randint(0, 1000, size=(5, 5)),
    index=['a', 'b', 'c', 'd', 'e'],
    columns=['v', 'w', 'x', 'y', 'z'],
)
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [35]:
# 2. Retrieve row b
# (a single label with .loc returns the row as a series)

df.loc['b']

v    763
w    707
x    359
y      9
z    723
Name: b, dtype: int64

In [36]:
# 3. Retrieve rows b and d
# (a list of labels with .loc returns those rows as a data frame)

df.loc[['b', 'd']]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
d,472,600,396,314,705


In [37]:
# another way to get rows b and d: slice from b through d with a step of 2, which skips c
df.loc['b':'d':2]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
d,472,600,396,314,705


In [38]:
# 4. Retrieve rows b, c, and d
# (a label slice with .loc includes the end label, so d is part of the result)

df.loc['b':'d']

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705


In [39]:
df['b':'d']  # a slice inside plain [] selects rows (not columns), so this gives the same rows as the .loc version

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705


In [40]:
# time the row slice when it goes through .loc
%timeit df.loc['b':'d']

625 µs ± 66.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [41]:
# time the same row slice using plain [], for comparison
%timeit df['b':'d'] 

545 µs ± 78.6 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [42]:
# 5. Retrieve column w
# (a single column name in [] returns that column as a series)

df['w']

a    559
b    707
c    754
d    600
e    551
Name: w, dtype: int64

In [43]:
# 6. Retrieve columns w and y
# (a list of column names in [] returns a data frame with just those columns)

df[['w', 'y']]

Unnamed: 0,w,y
a,559,192
b,707,9
c,754,599
d,600,314
e,551,174


In [44]:
# 7. Retrieve columns w, x, and y
# (listing each column name explicitly)

df[['w', 'x', 'y']]

Unnamed: 0,w,x,y
a,559,629,192
b,707,359,9
c,754,804,599
d,600,396,314
e,551,87,174


In [45]:
# 8. Retrieve the item at row e, column v
# (one row label and one column label select a single value)

df.loc[
    'e',   # row selector
    'v'    # column selector
]

486

In [48]:
# ':' as the row selector means "all rows"; the column selector is a label slice, w through y inclusive
df.loc[:, 'w':'y']

Unnamed: 0,w,x,y
a,559,629,192
b,707,359,9
c,754,804,599
d,600,396,314
e,551,87,174


In [49]:
# dtypes -- columns are series

# every column in a data frame is actually a series behind the scenes,
# so each column has a dtype of its own

df['v'].dtype

dtype('int64')

In [50]:
# a row retrieved with .loc is returned as a series too, so it also has a dtype
df.loc['a'].dtype

dtype('int64')

In [51]:
# I can get the dtypes of all columns with the "dtypes" attribute
# (note the plural: the result is a series with one entry per column)

df.dtypes

v    int64
w    int64
x    int64
y    int64
z    int64
dtype: object

In [52]:
# we can assign to values in a data frame via .loc!
# first, display the frame as it looks before anything is changed

df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [53]:
# read the current value at row c, column y
df.loc['c', 'y']

599

In [54]:
# assigning through .loc replaces the value at row c, column y; this modifies df itself
df.loc['c', 'y'] = 222

df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,804,222,70
d,472,600,396,314,705
e,486,551,87,174,600


In [55]:
# select the four cells at rows a and c, columns x and y
df.loc[['a', 'c'], ['x', 'y']]

Unnamed: 0,x,y
a,629,192
c,804,222


In [56]:
# assigning one value to a multi-row, multi-column selection
# sets every selected cell (rows a and c, columns x and y) to that value
df.loc[['a', 'c'], ['x', 'y']] = 333
df

Unnamed: 0,v,w,x,y,z
a,684,559,333,333,835
b,763,707,359,9,723
c,277,754,333,333,70
d,472,600,396,314,705
e,486,551,87,174,600


In [60]:
# change the order of columns, if you want:
# select the columns in the order you'd like them, then assign the result back to df
df = df[['y', 'z', 'v', 'x', 'w']]

In [61]:
# display the frame to confirm the new column order
df

Unnamed: 0,y,z,v,x,w
a,333,835,684,333,559
b,9,723,763,359,707
c,333,70,277,333,754
d,314,705,472,396,600
e,174,600,486,87,551


# Some other ways to create data frames

1. List of lists
2. 2D NumPy array
3. List of dicts -- the dict keys specify the column names, and each dict is a row
4. Dict of lists -- the dict keys specify the column names, and each list is a column

In [57]:
# adding rows/columns (including columns of arrays)
# removing rows and columns
# setting/unsetting the index
# queries based on rows/columns