# 第 2 章　Python 与 Jupyter Notebook 基础｜用 Python 动手学统计学

## 第 4 节　认识 numpy 与 pandas

### 1. 导入用于分析的功能

In [1]:
import numpy as np
import pandas as pd

### 3. 实现：列表

In [2]:
# A plain Python list holding the integers 1 through 5.
sample_list = [1, 2, 3, 4, 5]
sample_list

[1, 2, 3, 4, 5]

### 5. 实现：数组

In [3]:
# Build a NumPy array from a Python list of integers.
sample_array = np.array([1, 2, 3, 4, 5])
sample_array

array([1, 2, 3, 4, 5])

In [4]:
# Adding a scalar applies to every element of the array.
sample_array + 2

array([3, 4, 5, 6, 7])

In [5]:
# Multiplying by a scalar also applies to every element.
sample_array * 2

array([ 2,  4,  6,  8, 10])

In [6]:
# Mixing numbers with a string: every element is stored as a string
# (note the quoted values and the '<U11' dtype in the output).
np.array([1 ,2, "A"])

array(['1', '2', 'A'],
      dtype='<U11')

In [20]:
# Matrix: a two-dimensional array built from a list of equal-length rows.
sample_array_2 = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
])
sample_array_2

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [21]:
# Get the number of rows and columns as a (rows, columns) tuple.
sample_array_2.shape

(2, 5)

### 6. 实现：生成等差数列的方法

In [8]:
# Arithmetic sequence from 1 up to (but not including) 6, in steps of 1.
np.arange(start = 1, stop = 6, step = 1)

array([1, 2, 3, 4, 5])

In [9]:
# Float arguments work as well; the stop value 0.8 is excluded.
# NOTE: NumPy's documentation recommends np.linspace for non-integer steps,
# because floating-point rounding can affect the number of elements.
np.arange(start = 0.1, stop = 0.8, step = 0.2)

array([ 0.1,  0.3,  0.5,  0.7])

In [10]:
# Same call with positional arguments, in the order start, stop, step.
np.arange(0.1, 0.8, 0.2)

array([ 0.1,  0.3,  0.5,  0.7])

### 7. 实现：多种生成数组的方式

In [11]:
# Array whose elements are all the same value: "A" repeated 5 times.
np.tile("A", 5)

array(['A', 'A', 'A', 'A', 'A'],
      dtype='<U1')

In [12]:
# Array holding four 0s (integer elements, as the output shows).
np.tile(0, 4)

array([0, 0, 0, 0])

In [13]:
# Array containing only zeros (float elements, as the output shows).
np.zeros(4)

array([ 0.,  0.,  0.,  0.])

In [14]:
# Two-dimensional array of zeros with 2 rows and 3 columns.
np.zeros([2,3])

array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

In [15]:
# Array containing only ones.
np.ones(3)

array([ 1.,  1.,  1.])

### 8. 实现：切片

In [19]:
# One-dimensional array used for the slicing examples.
d1_array = np.array([1, 2, 3, 4, 5])
d1_array

array([1, 2, 3, 4, 5])

In [17]:
# Get the first element (indices start at 0).
d1_array[0]

1

In [18]:
# Get the elements at indices 1 and 2 (the stop index 3 is excluded).
d1_array[1:3]

array([2, 3])

In [17]:
# Two-dimensional array (2 rows, 5 columns) used for the slicing examples.
d2_array = np.array([
    [1, 2, 3, 4, 5],
    [6, 7, 8, 9, 10],
])
d2_array

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10]])

In [18]:
# Element at row index 0, column index 3.
d2_array[0, 3]

4

In [19]:
# Row index 1, column indices 2 and 3 (the stop index 4 is excluded).
d2_array[1, 2:4]

array([8, 9])

### 9. 实现：数据帧

In [20]:
# Build a DataFrame from a dict that maps column names to column data.
# Each of the three columns has five entries.
sample_df = pd.DataFrame(
    {
        "col1": sample_array,
        "col2": sample_array * 2,
        "col3": ["A", "B", "C", "D", "E"],
    }
)
print(sample_df)

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C
3     4     8    D
4     5    10    E


In [21]:
# Evaluating the DataFrame as the last expression of a cell (instead of
# calling print) shows it with the notebook's table display.
sample_df

|   | col1 | col2 | col3 |
|---|------|------|------|
| 0 | 1 | 2 | A |
| 1 | 2 | 4 | B |
| 2 | 3 | 6 | C |
| 3 | 4 | 8 | D |
| 4 | 5 | 10 | E |


### 10. 实现：读取文件中的数据

In [22]:
# Load the CSV file (given as a relative path) and print its contents.
file_data = pd.read_csv("2-4-1-sample_data.csv")
print(file_data)

   col1 col2
0     1    A
1     2    A
2     3    B
3     4    B
4     5    C
5     6    C


In [23]:
# The object returned by read_csv is a pandas DataFrame.
type(file_data)

pandas.core.frame.DataFrame

### 11. 实现：连接数据帧

In [24]:
# Two small DataFrames with identical column names, used to demonstrate
# concatenation.
df_1 = pd.DataFrame(
    {
        "col1": np.array([1, 2, 3]),
        "col2": np.array(["A", "B", "C"]),
    }
)
df_2 = pd.DataFrame(
    {
        "col1": np.array([4, 5, 6]),
        "col2": np.array(["D", "E", "F"]),
    }
)

In [25]:
# Concatenate vertically (stack the rows). Each frame keeps its own row
# labels, so 0-2 appear twice in the result.
print(pd.concat([df_1, df_2]))

   col1 col2
0     1    A
1     2    B
2     3    C
0     4    D
1     5    E
2     6    F


In [26]:
# Concatenate horizontally (side by side); the column names repeat.
print(pd.concat([df_1, df_2], axis = 1))

   col1 col2  col1 col2
0     1    A     4    D
1     2    B     5    E
2     3    C     6    F


### 12. 实现：数据帧的列操作

In [27]:
# The DataFrame that the following column operations work on.
print(sample_df)

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C
3     4     8    D
4     5    10    E


In [28]:
# Get a column by name using attribute access.
print(sample_df.col2)

0     2
1     4
2     6
3     8
4    10
Name: col2, dtype: int32


In [29]:
# Get the same column using bracket notation with the column name.
print(sample_df["col2"])

0     2
1     4
2     6
3     8
4    10
Name: col2, dtype: int32


In [30]:
# Get several columns at once by passing a list of column names.
print(sample_df[["col2", "col3"]])

   col2 col3
0     2    A
1     4    B
2     6    C
3     8    D
4    10    E


In [31]:
# Drop the given column (axis = 1 targets columns). The result is a new
# DataFrame; sample_df still has col1 in the later cells.
print(sample_df.drop("col1", axis = 1))

   col2 col3
0     2    A
1     4    B
2     6    C
3     8    D
4    10    E


### 13. 实现：数据帧的行操作

In [32]:
# Get the first 3 rows.
print(sample_df.head(n = 3))

   col1  col2 col3
0     1     2    A
1     2     4    B
2     3     6    C


In [33]:
# Get the first row, i.e. the row whose index label is 0.
print(sample_df.query('index == 0'))

   col1  col2 col3
0     1     2    A


In [34]:
# Select rows by a condition on a column value (here col3 equals "A").
print(sample_df.query('col3 == "A"'))

   col1  col2 col3
0     1     2    A


In [35]:
# Select rows with an OR condition: col3 is "A" or col3 is "D".
print(sample_df.query('col3 == "A" | col3 == "D"'))

   col1  col2 col3
0     1     2    A
3     4     8    D


In [36]:
# Select rows with an AND condition. No row has col3 == "A" together with
# col1 == 3, so the result is an empty DataFrame.
print(sample_df.query('col3 == "A" & col1 == 3'))

Empty DataFrame
Columns: [col1, col2, col3]
Index: []


In [37]:
# Restrict rows and columns together: filter rows with query, then pick
# the columns from the filtered result.
print(sample_df.query('col3 == "A"')[["col2", "col3"]])

   col2 col3
0     2    A


### 14. 补充：序列

In [38]:
# The whole table is a DataFrame.
type(sample_df)

pandas.core.frame.DataFrame

In [39]:
# A single column taken from a DataFrame is a Series.
type(sample_df.col1)

pandas.core.series.Series

In [40]:
# Convert the Series to a NumPy array.
type(np.array(sample_df.col1))

numpy.ndarray

In [41]:
# The values attribute of a Series is also a NumPy array.
type(sample_df.col1.values)

numpy.ndarray

### 15. 补充：函数文档

In [42]:
# Print the built-in documentation of the DataFrame.query method.
help(sample_df.query)

Help on method query in module pandas.core.frame:

query(expr, inplace=False, **kwargs) method of pandas.core.frame.DataFrame instance
    Query the columns of a frame with a boolean expression.
    
    .. versionadded:: 0.13
    
    Parameters
    ----------
    expr : string
        The query string to evaluate.  You can refer to variables
        in the environment by prefixing them with an '@' character like
        ``@a + b``.
    inplace : bool
        Whether the query should modify the data in place or return
        a modified copy
    
        .. versionadded:: 0.18.0
    
    kwargs : dict
        See the documentation for :func:`pandas.eval` for complete details
        on the keyword arguments accepted by :meth:`DataFrame.query`.
    
    Returns
    -------
    q : DataFrame
    
    Notes
    -----
    The result of the evaluation of this expression is first passed to
    :attr:`DataFrame.loc` and if that fails because of a
    multidimensional key (e.g., a DataFrame) 