In [1]:
from sklearn.datasets import load_boston
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on an older scikit-learn — confirm the pinned version.
boston = load_boston()
# Report the dimensions of the feature matrix as (rows, columns).
n_rows, n_cols = boston.data.shape
print((n_rows, n_cols))

(506, 13)


In [2]:
# Check the container type of the feature matrix before wrapping it in pandas.
type(boston.data)

numpy.ndarray

In [25]:
import pandas as pd
import numpy as np

#### 1. Convert numpy array to pandas DataFrame

In [4]:
# Wrap the raw feature matrix in a DataFrame with named columns, and preview only
# the first 10 rows instead of rendering all 506.
pd.DataFrame(data=boston.data, columns=boston.feature_names).head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.60,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.90,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.10


In [28]:
# Show which pandas release is installed; API details vary between releases.
pd.__version__

'0.19.2'

#### 2. Series

In [8]:
# A Series built from a plain list gets a default integer index (0..n-1).
obj = pd.Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
# A Series exposes its data (.values) and its labels (.index) separately.
obj.values, obj.index

(array([ 4,  7, -5,  3]), RangeIndex(start=0, stop=4, step=1))

In [10]:
# Relabel the Series by assigning to .index; labels may mix strings and integers,
# and the values themselves are untouched.
new_labels = ['Bob', 1, 2, 4]
obj.index = new_labels
obj

Bob    4
1      7
2     -5
4      3
dtype: int64

#### 3. Create a DataFrame
- One of the most common ways to build a DataFrame is from a dict of equal-length lists or NumPy arrays

In [11]:
# Column-oriented input: each dict key becomes a column and each list supplies
# that column's values (all lists have the same length).
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)

In [12]:
# Display the frame. NOTE(review): the recorded output lists the columns
# alphabetically rather than in dict order — presumably how the pandas release
# used here (0.19.2) treats dict input; confirm before relying on column order.
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


#### 4. axis
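`axis=0` runs along the rows (the index, in pandas terms), so a reduction moves down each column and yields one value per column; `axis=1` runs along the columns, so a reduction moves across each row and yields one value per row.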

For added clarity, one may choose to specify axis='index' (instead of axis=0) or axis='columns' (instead of axis=1)

In `df.iloc[row, column]`, row is in index position 0 and column is in index position 1.

[What does axis in pandas mean?](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)

In [14]:
# 3x3 frame holding 0..8 filled row by row; both axes get default integer labels.
obj2 = pd.DataFrame(data=np.arange(0, 9).reshape((3, 3)))

In [15]:
# Display the 3x3 frame; row and column labels both default to 0, 1, 2.
obj2

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [16]:
# axis=1 collapses the columns: one sum per row, so the result keeps the row index.
obj2.sum(axis=1)

0     3
1    12
2    21
dtype: int64

In [17]:
# axis=0 collapses the rows: one sum per column, so the result is indexed by column label.
obj2.sum(axis=0)

0     9
1    12
2    15
dtype: int64

In [18]:
# With axis='columns' (same as axis=1) the label 2 names a column, so that whole
# column is removed from the returned frame.
obj2.drop(labels=2, axis='columns')

Unnamed: 0,0,1
0,0,1
1,3,4
2,6,7


#### 5. change column name
- http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html
- https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas

In [19]:
# Small two-column frame used to demonstrate renaming.
df = pd.DataFrame(
    data={
        "A": [1, 2, 3],
        "B": [4, 5, 6],
    }
)

In [20]:
# Map old column names to new ones; columns missing from the mapping are kept.
# The result is not assigned here, so it is only displayed.
column_mapping = {'A': 'haha'}
df.rename(columns=column_mapping)

Unnamed: 0,haha,B
0,1,4
1,2,5
2,3,6


#### 6. remove columns or rows
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [26]:
# 3x4 frame holding 0..11 filled row by row, with columns labelled A-D.
df = pd.DataFrame(data=np.arange(0, 12).reshape((3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [29]:
# Remove several columns at once. The result is not assigned, so `df` still has
# all four columns afterwards.
df.drop(labels=['B', 'C'], axis='columns')

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [30]:
# Drop the row labelled 2 (axis=0 targets the index). The result is bound to a
# new name instead of using inplace=True: `df` stays intact, so re-running this
# cell does not raise a KeyError for a label that was already removed.
df_without_row_2 = df.drop([2], axis=0)
df_without_row_2

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7


#### 7. append
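- Append the rows of one DataFrame to another DataFrame. Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0; `pd.concat` is its replacement.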
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html

In [31]:
# First of two frames with identical columns, used to demonstrate row-wise stacking.
df = pd.DataFrame(data=[[1, 2], [3, 4]], columns=['A', 'B'])
df

Unnamed: 0,A,B
0,1,2
1,3,4


In [32]:
# Second frame with the same columns; its rows also carry the default labels 0 and 1.
df2 = pd.DataFrame(data=[[5, 6], [7, 8]], columns=['A', 'B'])
df2

Unnamed: 0,A,B
0,5,6
1,7,8


In [33]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the frames row-wise and works on both old and new releases.
# ignore_index=True renumbers the result 0..n-1 instead of repeating labels 0, 1.
pd.concat([df, df2], ignore_index=True)

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6
3,7,8


#### 8. apply function by column
- https://stackoverflow.com/a/34962199/2803344

In [37]:
# Two integer columns used to demonstrate applying a function to one column.
df = pd.DataFrame(
    data={
        'a': [1, 2, 3, 4],
        'b': [4, 54, 5, 6],
    }
)
df

Unnamed: 0,a,b
0,1,4
1,2,54
2,3,5
3,4,6


In [38]:
# Series.apply calls the function once per element of column 'a' and returns a
# new Series; nothing is assigned here, so the result is only displayed.
df['a'].apply(lambda value: value + 3)

0    4
1    5
2    6
3    7
Name: a, dtype: int64