In [1]:
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [2]:
type(boston.data)

numpy.ndarray

In [3]:
import pandas as pd
import numpy as np

#### 1. Convert numpy array to pandas DataFrame

In [4]:
df_boston = pd.DataFrame(data=boston.data, columns=boston.feature_names)
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
pd.__version__

'0.23.1'

#### 2. Series

In [6]:
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
obj.values, obj.index

(array([ 4,  7, -5,  3]), RangeIndex(start=0, stop=4, step=1))

In [8]:
obj.index = ['Bob', 1, 2, 4]
obj

Bob    4
1      7
2     -5
4      3
dtype: int64

#### 3. Create a Dataframe
- one of the most common is from a dict of equal-length lists or NumPy arrays

In [9]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)

In [10]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


#### 4. axis
`axis=0` along the rows (namely, index in pandas), and `axis=1` along the columns

For added clarity, one may choose to specify axis='index' (instead of axis=0) or axis='columns' (instead of axis=1)

In `df.iloc[row, column]`, row is in index position 0 and column is in index position 1.

[What does axis in pandas mean?](https://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean)

In [11]:
obj2 = pd.DataFrame(np.arange(9).reshape(3, 3))

In [12]:
obj2

Unnamed: 0,0,1,2
0,0,1,2
1,3,4,5
2,6,7,8


In [13]:
np.sum(obj2, axis=1)  # axis=1 means row-wise，相当于最终保留了行的索引，一个列被当做一个整体来操作

0     3
1    12
2    21
dtype: int64

In [14]:
np.sum(obj2, axis=0)  # axis=0 means column-wise，此时将行当做一个整体来操作

0     9
1    12
2    15
dtype: int64

In [15]:
obj2.drop(2, axis=1)  # 此时将列当做一个整体操作

Unnamed: 0,0,1
0,0,1
1,3,4
2,6,7


#### 5. change column name
- http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.rename.html
- https://stackoverflow.com/questions/11346283/renaming-columns-in-pandas

In [16]:
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

In [17]:
df.rename(columns={'A': 'haha'})

Unnamed: 0,haha,B
0,1,4
1,2,5
2,3,6


#### 6. remove columns or rows
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.drop.html

In [18]:
df = pd.DataFrame(np.arange(12).reshape(3,4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [19]:
df.drop(['B', 'C'], axis=1)

Unnamed: 0,A,D
0,0,3
1,4,7
2,8,11


In [20]:
df.drop([2], axis=0, inplace=True)
df

Unnamed: 0,A,B,C,D
0,0,1,2,3
1,4,5,6,7


#### 7. append
- append a data frame to another data frame
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html

In [21]:
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df

Unnamed: 0,A,B
0,1,2
1,3,4


In [22]:
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
df2

Unnamed: 0,A,B
0,5,6
1,7,8


In [23]:
df.append(df2, ignore_index=True)

Unnamed: 0,A,B
0,1,2
1,3,4
2,5,6
3,7,8


#### 8. apply function by column
单行: https://stackoverflow.com/a/34962199/2803344

In [24]:
df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [4, 54, 5, 6]})
df

Unnamed: 0,a,b
0,1,4
1,2,54
2,3,5
3,4,6


In [25]:
df['a'].apply(lambda x: x+3)

0    4
1    5
2    6
3    7
Name: a, dtype: int64

多行: https://stackoverflow.com/a/16354730/2803344

In [26]:
def test_fun(row):
    return row['a'] * row['b']

In [27]:
df.apply(test_fun, axis=1)  # 每次按行为一个最小单位

0      4
1    108
2     15
3     24
dtype: int64

#### 9. sort
- http://pandas.pydata.org/pandas-docs/version/0.19/generated/pandas.DataFrame.sort.html

In [28]:
df_boston.sort_values(by=['TAX']).head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
353,0.01709,90.0,2.02,0.0,0.41,6.728,36.1,12.1265,5.0,187.0,17.0,384.46,4.5
123,0.15038,0.0,25.65,0.0,0.581,5.856,97.0,1.9444,2.0,188.0,19.1,370.31,25.41
122,0.09299,0.0,25.65,0.0,0.581,5.961,92.9,2.0869,2.0,188.0,19.1,378.09,17.93
126,0.38735,0.0,25.65,0.0,0.581,5.613,95.6,1.7572,2.0,188.0,19.1,359.29,27.26
125,0.16902,0.0,25.65,0.0,0.581,5.986,88.4,1.9929,2.0,188.0,19.1,385.02,14.81


In [29]:
result = df_boston.sort_values(by=['CRIM', 'AGE'], ascending=[1, 0])
result.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
284,0.00906,90.0,2.97,0.0,0.4,7.088,20.8,7.3073,1.0,285.0,15.3,394.72,7.85
285,0.01096,55.0,2.25,0.0,0.389,6.453,31.9,7.3073,1.0,300.0,15.3,394.72,8.23
341,0.01301,35.0,1.52,0.0,0.442,7.241,49.3,7.0379,1.0,284.0,15.5,394.74,5.49
55,0.01311,90.0,1.22,0.0,0.403,7.249,21.9,8.6966,5.0,226.0,17.9,395.93,4.81


#### 10. merge
- `right_on`='column_name', `right_index`=True
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html

In [30]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'], 'data2': range(3)})

In [31]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [32]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [33]:
pd.merge(df1, df2)  # by index

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [34]:
pd.merge(df1, df2, on='key')  # by key

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


keep index when using pandas merge(在merge时，保留index)
- https://stackoverflow.com/questions/11976503/how-to-keep-index-when-using-pandas-merge

In [35]:
a = pd.DataFrame({'col1': {'a': 1, 'b': 2, 'c': 3}, 
                          'to_merge_on': {'a': 1, 'b': 3, 'c': 4}})
a

Unnamed: 0,col1,to_merge_on
a,1,1
b,2,3
c,3,4


In [36]:
b = pd.DataFrame({'col2': {0: 1, 1: 2, 2: 3}, 
                          'to_merge_on': {0: 1, 1: 3, 2: 5}})
b

Unnamed: 0,col2,to_merge_on
0,1,1
1,2,3
2,3,5


**reset_index()的功能**
```
>>> df
         class  max_speed
falcon    bird      389.0
parrot    bird       24.0
lion    mammal       80.5
monkey  mammal        NaN
```
When we reset the index, the old index is added as a column, and a
new sequential index is used:
```
>>> df.reset_index()
    index   class  max_speed
0  falcon    bird      389.0
1  parrot    bird       24.0
2    lion  mammal       80.5
3  monkey  mammal        NaN
```
We can use the `drop` parameter to avoid the old index being added as
a column:
```
>>> df.reset_index(drop=True)
    class  max_speed
0    bird      389.0
1    bird       24.0
2  mammal       80.5
3  mammal        NaN
```

In [37]:
a.reset_index().merge(b, how="left").set_index(a.index)

Unnamed: 0,index,col1,to_merge_on,col2
a,a,1,1,1.0
b,b,2,3,2.0
c,c,3,4,


#### 11. set a column as index

In [38]:
df = pd.DataFrame({'month': [1, 4, 7, 10],
                   'year': [2012, 2014, 2013, 2014],
                   'sale':[55, 40, 84, 31]})
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


In [39]:
df.set_index('month')

Unnamed: 0_level_0,year,sale
month,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2012,55
4,2014,40
7,2013,84
10,2014,31


In [40]:
df

Unnamed: 0,month,year,sale
0,1,2012,55
1,4,2014,40
2,7,2013,84
3,10,2014,31


#### 12. 相关系数
- https://stackoverflow.com/questions/3949226/calculating-pearson-correlation-and-significance-in-python
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.corr.html

In [41]:
a = [[1, 2, 3],
     [5, 6, 9],
     [5, 6, 11],
     [5, 6, 13],
     [5, 3, 13]]
df = pd.DataFrame(data=a)
df

Unnamed: 0,0,1,2
0,1,2,3
1,5,6,9
2,5,6,11
3,5,6,13
4,5,3,13


In [42]:
df.corr()  # Compute pairwise correlation of columns, excluding NA/null values

Unnamed: 0,0,1,2
0,1.0,0.745601,0.916579
1,0.745601,1.0,0.544248
2,0.916579,0.544248,1.0


In [43]:
np.corrcoef(df[0], df[2])

array([[ 1.        ,  0.91657857],
       [ 0.91657857,  1.        ]])

#### 13. groupby
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.GroupBy.apply.html

In [44]:
df3 = pd.DataFrame(columns=['key', 'data1', 'data2'], 
                   data={'key': ['a', 'b', 'a', 'a', 'a'], 
                         'data1': ['663-65', '2086-36', '1405-66', '543-15', '543-15'],
                         'data2': ['1405-66', '543-15', '543-15', '543-15', '543-15']})
df3

Unnamed: 0,key,data1,data2
0,a,663-65,1405-66
1,b,2086-36,543-15
2,a,1405-66,543-15
3,a,543-15,543-15
4,a,543-15,543-15


In [45]:
def merge2str(group):
    return ', '.join(group.drop_duplicates().tolist())

In [46]:
def add_str_to_df(df):
    df['data1'] = df['data1'] + 'ss'
    df['data2'] = df['data2'] + 'aa'
    return df

In [47]:
# apply different function to multiple columns
df3.groupby('key').aggregate({'data1': merge2str, 'data2': merge2str})

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
a,"663-65, 1405-66, 543-15","1405-66, 543-15"
b,2086-36,543-15


In [48]:
# apply a function to a single column
df3.groupby('key')['data1'].apply(merge2str)

key
a    663-65, 1405-66, 543-15
b                    2086-36
Name: data1, dtype: object

In [49]:
# apply a function to the whole dataframe
df3.groupby('key').apply(add_str_to_df)

Unnamed: 0,key,data1,data2
0,a,663-65ss,1405-66aa
1,b,2086-36ss,543-15aa
2,a,1405-66ss,543-15aa
3,a,543-15ss,543-15aa
4,a,543-15ss,543-15aa


#### 14. 添加新行到DataFrame
- 如果添加的行非常多，速度会很慢，这时可以使用下面的方式以`CSV`的格式写入内存，可以极大的提高速度
- https://stackoverflow.com/a/48287388/2803344
- https://stackoverflow.com/a/37974772/2803344

In [50]:
import io
import csv

output_file_handle = io.StringIO()  # 创建一个写入内存的文件句柄
csv_writer = csv.writer(output_file_handle)  # 创建一个csv writer
for i in range(5):
    csv_writer.writerow([i, 2, 3, 4])  # 按行写入
output_file_handle.seek(0)  # 这一句很重要，让读文件的指针回到起始位置
pd.read_csv(output_file_handle, header=None)  # 从内存中读取csv文件

Unnamed: 0,0,1,2,3
0,0,2,3,4
1,1,2,3,4
2,2,2,3,4
3,3,2,3,4
4,4,2,3,4
