In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator so the random draws below are repeatable.
np.random.seed(1234)

# Mainly talking about Pandas and Numpy, distinct from Python

## Fast

In [3]:
%%time
# Draw 100 million uniform random floats into a single NumPy array and time it.
x = np.random.random(100_000_000)
x

CPU times: user 1.2 s, sys: 250 ms, total: 1.45 s
Wall time: 1.45 s


## Like a lot of Python, it's highly readable
- naming of methods and syntax is really clear
- chaining methods together makes an operation read like a sentence

In [4]:
%%time
pd.DataFrame({
    'v1' : x[:500_000]
}).to_csv("~/Desktop/test.csv")

CPU times: user 1.97 s, sys: 62.5 ms, total: 2.04 s
Wall time: 1.51 s


In [5]:
%%time
# index_col=0 reads the first CSV column (the row index that to_csv wrote) back in as the index.
x2 = pd.read_csv("~/Desktop/test.csv", index_col=0)

CPU times: user 408 ms, sys: 43.8 ms, total: 452 ms
Wall time: 269 ms


In [6]:
# Confirm the CSV round trip preserved the values to within 1e-6.
# atol is an *absolute* tolerance: the previous atol=6 accepted any pair of
# values in [0, 1), so the check could never fail.
np.allclose(x[:500_000], x2.v1.values, atol=1e-6)

True

# Separation of Concerns
### Code and Output are kept separate

which means I can
- version control my code
- rerun my code
- store data separately from code
- refresh data and keep code same or vice versa

In [73]:
# Preview the first five rows of the frame that was read back from disk.
x2.head()

Unnamed: 0,v1
0,0.087477
1,0.418285
2,0.621524
3,0.909306
4,0.228845


# Vectorized Code means you don't have to manage For-loops

### You think in terms of the transformations, not all the steps needed to make the transformation

In [42]:
%%time

## when people say python is really slow
## they mean programming python the way you program lower-level languages
yy = []
for i in range(len(x)):
    yy.append(x[i]+2)
yy

CPU times: user 46.7 s, sys: 1.68 s, total: 48.3 s
Wall time: 48.9 s


In [8]:
%time ly = [v+2 for v in x]

CPU times: user 26.4 s, sys: 1.33 s, total: 27.7 s
Wall time: 27.9 s


In [3]:
%%time 
ly = (v+2 for v in np.random.random(100_000_000))
ly2 = np.fromiter(ly,np.float)

CPU times: user 31.6 s, sys: 639 ms, total: 32.3 s
Wall time: 32.3 s


In [35]:
# Vectorized: one expression adds 2 to every element, with no Python-level loop.
%time y = x+2

CPU times: user 492 ms, sys: 456 ms, total: 949 ms
Wall time: 244 ms


In [36]:
# Same vectorized addition, with the result wrapped in a pandas Series.
%time z = pd.Series(x+2)

CPU times: user 439 ms, sys: 411 ms, total: 850 ms
Wall time: 213 ms


# Different Types of Objects have Different Functionality out of the Box
### - Can feel like a gotcha until you see the value of special tools
### - Simplifies code incredibly
### - Leverages really powerful C bindings

In [37]:
# Three different containers: a NumPy array, a pandas Series and a pandas DataFrame.
type(y), type(z), type(x2)

(numpy.ndarray, pandas.core.series.Series, pandas.core.frame.DataFrame)

In [16]:
# First five elements of the NumPy array.
y[:5]

array([ 2.08747744,  2.41828462,  2.62152356,  2.90930634,  2.22884529])

In [72]:
# First five entries of the Series; the index labels are shown next to the values.
z.head()

0    2.087477
1    2.418285
2    2.621524
3    2.909306
4    2.228845
dtype: float64

In [43]:
# First five rows of the DataFrame: an index plus the named column v1.
x2.head()

Unnamed: 0,v1
0,0.087477
1,0.418285
2,0.621524
3,0.909306
4,0.228845


## Similar Syntax throughout Python

#### Pandas

In [37]:
# Mean via the pandas Series method.
%time z.mean()

CPU times: user 153 ms, sys: 4.09 ms, total: 157 ms
Wall time: 157 ms


2.5000145782194156

#### Numpy

In [40]:
# Same method name on the NumPy array.
%time y.mean()

CPU times: user 58.2 ms, sys: 1.85 ms, total: 60 ms
Wall time: 58.5 ms


2.500014578220217

#### Numpy on a List

In [43]:
# yy is a plain Python list, so NumPy has to convert it to an array before averaging.
%time np.mean(yy)

CPU times: user 7.26 s, sys: 3.8 s, total: 11.1 s
Wall time: 11.5 s


2.500014578220217

#### Pandas on a List

In [44]:
# The Python list is turned into a Series first; that conversion is presumably
# what dominates the timing here (compare with z.mean()).
%time pd.Series(yy).mean()

CPU times: user 9.67 s, sys: 2.62 s, total: 12.3 s
Wall time: 12.9 s


2.5000145782194156

# Value of Index 

## Pandas has a lot in common with SQL

### https://pandas.pydata.org/pandas-docs/stable/comparison_with_sql.html

In [67]:
# Keep just the first five entries of z as a small Series and display it.
zz = z.head()

zz

0    2.087477
1    2.418285
2    2.621524
3    2.909306
4    2.228845
dtype: float64

In [70]:
# Place the two slices side by side as columns. concat aligns rows on the
# index labels (like an outer join): head(4) has labels 0-3 and tail(4) has
# labels 1-4, so a label missing from one side shows up as NaN.
# axis is passed by keyword: positional use was deprecated in pandas 1.3 and
# is an error from pandas 2.0.
pd.concat([zz.head(4), zz.tail(4)], axis=1)

Unnamed: 0,0,1
0,2.087477,
1,2.418285,2.418285
2,2.621524,2.621524
3,2.909306,2.909306
4,,2.228845
