# Notes for "Python for Data Analysis"

First, set up the environment used throughout these notes.

In [None]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
# this allows plots to appear directly in the notebook
%matplotlib inline

## Whether built-in functions make a copy in basic packages, NumPy and pandas

### Basic packages

#### Make a Copy:

#### Make no copy:

* `.append`: appends the passed object to the existing list in place.

In [None]:
# Start from a three-element list, then grow that same list object in place.
a = list(range(1, 4))
a.append(4)

### NumPy

#### Make a Copy:

* `np.array`: create a NumPy array from the input data.

In [None]:
# Plain Python list used as the input data.
a = [1, 2, 3]
# Build an ndarray from the list; copy=True is the default, spelled out here.
anp = np.array(a, copy=True)

* `astype`: produce a new array with a different type.

In [None]:
# Convert anp (created in an earlier cell) to 64-bit floats; the result is
# bound to a new name.
float_anp = anp.astype("float64")

* `[].copy()`: make a copy of selected data

* Boolean index

In [None]:
# Boolean mask marking the entries of anp that are greater than 2 ...
greater_than_two = anp > 2
# ... used as an index to select just those entries.
anp[greater_than_two]

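* `np.sort`: return a sorted copy of the array, in ascending order by default. (The method `anp.sort()` sorts the array in place instead and returns `None`.)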

* `permutation`: return a random permutation of a sequence.

* `np.concatenate`: concatenate multiple np.ndarray objects by rows (the default) or by columns.

* `np.c_[]` and `np.r_[]`: concatenate objects by columns and by rows respectively. 

In [None]:
# The same three inputs are fed to each of the joining helpers below
# (`a` comes from an earlier cell).
copies = (a, a, a)
# Function form: takes the sequence of inputs as one argument.
np.concatenate(copies)
# Index form: np.c_[a, a, a] and np.c_[(a, a, a)] pass the same tuple.
np.c_[copies]
np.r_[copies]

#### Make no copy:

* `[]`: basic indexing and slicing return a view of the original array.

In [None]:
# Take a slice, which is a view onto anp's data: writing through bnp also
# changes anp. (A plain integer index such as anp[2] on this 1-D array would
# return a scalar, and assigning into a scalar raises TypeError.)
bnp = anp[2:]
bnp[0] = 5

* `T`: transpose.

* `np.random.shuffle`: randomly permute a sequence in place. Notably, it returns `None`, so its result cannot be assigned to another variable.

* `reshape`: Re-shape the np.ndarray.

### pandas

#### Make a copy

* `Series`: generate 1D array in pandas.

In [None]:
# Wrap the list `a` (from an earlier cell) in a pandas Series.
apd = Series(data=a)

* `.reindex`: conform a Series or DataFrame to a new index. It returns a new object unless the new index is equivalent to the current one and copy=False.

* `.sort_index`: sort a Series or DataFrame by row or column. By default it sorts by the row index, but it can also sort by row or column values.

In [None]:
# Sort frame using column 'b' as the sort key instead of the row index.
frame.sort_index(by="b")

* `.order`: to sort a Series by its values.

* `.dropna`: drop NaN from Series or DataFrame.

In [None]:
# No arguments: the default dropping rule is used.
frame.dropna()
# thresh sets how many observations a row must contain in order to be kept.
frame.dropna(thresh=2)

* `.fillna`: fill in missing data.

In [None]:
# Replace missing values with 0 in the result of the call.
# (The name was previously misspelled `fram`, which is not defined anywhere.)
frame.fillna(0)
# inplace=True replaces the values in the original frame; the return value is
# bound to a throwaway name so the cell shows no output.
_ = frame.fillna(0, inplace=True)

* `unstack`: rearrange Series or DataFrame with hierarchical index into a DataFrame.

* `stack`: the inverse operation of unstack.

* `.swaplevel`: interchange the levels.

* `.sortlevel`: sort the specific level.

In [None]:
# 4x3 table holding 0..11, with two-level labels on both the rows and the columns.
row_labels = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
column_labels = [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']]
frame1 = DataFrame(np.arange(12).reshape((4, 3)),
                   index=row_labels, columns=column_labels)
# Exchange the two row-index levels, then sort on level 0 of the result.
swapped = frame1.swaplevel(0, 1)
frame2 = swapped.sortlevel(0)

* `.set_index`: uses one or more columns of DataFrame as the index.

In [None]:
# Four columns of length 7; 'c' holds 'one' three times followed by 'two' four times.
data = {
    'a': range(7),
    'b': range(7, 0, -1),
    'c': np.repeat(['one', 'two'], [3, 4]),
    'd': [0, 1, 2, 0, 1, 2, 3],
}
frame3 = DataFrame(data)
# Use columns 'c' and 'd' as a two-level row index; the result is bound to a
# new name and frame3 keeps all four columns.
index_columns = ['c', 'd']
frame4 = frame3.set_index(index_columns)

* `.swapaxes`: interchange the axes of a DataFrame or pd.Panel.

In [None]:
# Exchange axis 0 and axis 1 of frame; the result is bound to a new name.
first_axis, second_axis = 0, 1
frame5 = frame.swapaxes(first_axis, second_axis)

* `.to_frame`: convert Series or pd.Panel data to DataFrame.

In [None]:
import pandas.io.data as web

# Fetch one data set per ticker via get_data_yahoo (needs network access) and
# collect them in a Panel keyed by ticker symbol.
tickers = ['AAPL', 'GOOG', 'MSFT', 'DELL']
pdata = pd.Panel({stk: web.get_data_yahoo(stk) for stk in tickers})
# to_frame is a method and has to be called; without the parentheses frame6
# would be bound to the method object rather than to the converted data.
frame6 = pdata.to_frame()

#### Make no copy

* `.index`: call index of an array in pandas.

In [None]:
# Bind the index of apd (the Series built in an earlier cell) to its own name.
apdi = apd.index    # apdi cannot be modified

* `.values`: call values of an array in pandas.

* `.ix`: make the DataFrame index system similar to numpy.

In [None]:
# Build an actual DataFrame: a plain dict has no .ix indexer, so the lookup
# below would raise AttributeError on the dict.
frame = DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
# Select the row with index 1.
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in 1.0; on newer
# pandas use frame.loc[1] (label-based) or frame.iloc[1] (position-based).
b = frame.ix[1]

* `icol, irow`: select single column or row as a Series by integer location.