In [1]:
import numpy as np
import pandas as pd

# 1 `Series`


## 1.1 创建 Series 数据结构

`pd.Series(data = None, index = None, dtype = None, name = None, copy = False)`

data：这是 Series 的数据部分，可以是列表、NumPy 数组、字典等。它是 Series 中的实际数据内容。可以是以下几种形式：

- 列表或数组：直接传递一个包含数据的列表或 NumPy 数组。

- 字典：将字典的值作为数据，字典的键可以成为 Series 的索引标签。

- 标量值：如果只传递一个标量值，它会被广播到 Series 的所有元素。

index：这是 Series 的索引部分，它用于标识数据的标签或标识。可以是以下几种形式：

- 列表、数组或其他可迭代对象：用于指定 Series 的索引标签。

- 如果未提供索引参数，将默认使用整数索引。

dtype：这是 Series 的数据类型，用于指定 Series 中数据的数据类型。默认情况下，pandas 会尝试推断数据类型。你可以明确指定数据类型，例如 dtype='int64' 或 dtype='float64'。

name：这是 Series 对象的名称，可以给 Series 对象起一个可识别的名称。它通常用于标识数据的含义。例如，如果你创建了一个表示温度的 Series，你可以将其命名为 'Temperature'。

copy：这是一个布尔值，用于控制是否复制传递给 Series 的数据。默认情况下，如果数据是可变的（例如列表），则会创建数据的副本，以避免原始数据的不受控制的更改。设置为 True 会始终复制数据，而设置为 False 则会尝试共享数据的引用。


### 1.1.1 使用列表作为数据源创建 Series


In [2]:
ar_list = [1, 2, 3, 4, 5]
print(type(ar_list))
print("-" * 35)

# 使用列表创建 Series
s1 = pd.Series(ar_list)
print(s1)
print("-" * 35)
print(type(s1))

<class 'list'>
-----------------------------------
0    1
1    2
2    3
3    4
4    5
dtype: int64
-----------------------------------
<class 'pandas.core.series.Series'>


### 1.1.2 使用数组作为数据源创建 Series


In [3]:
array = np.arange(6)
print(array)
print(type(array))
print("-" * 35)

# 创建 Series
s1 = pd.Series(array, index=["a", "b", "c", "d", "e", "f"])
print(s1)
print("-" * 35)
print(type(s1))

print("-" * 35)

# 输出标签和值，标签可以强制转换为 list
print(list(s1.index), "\t", s1.values)
print("-" * 35)
print("type(s1.values) = ", type(s1.values))

[0 1 2 3 4 5]
<class 'numpy.ndarray'>
-----------------------------------
a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64
-----------------------------------
<class 'pandas.core.series.Series'>
-----------------------------------
['a', 'b', 'c', 'd', 'e', 'f'] 	 [0 1 2 3 4 5]
-----------------------------------
type(s1.values) =  <class 'numpy.ndarray'>


### 1.1.3 使用字典作为数据源创建 Series

如果用字典预先指定了 index，但是构造 Series 的时候又传递了 index 数组作为参数，此时发生冲突的 index 的值会被赋值为 NaN

也可以利用 index 传递一个 dict 键的重排列，来构建一个重新排序的 Series


In [4]:
dict = {"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}

s1 = pd.Series(dict)

print(s1)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [5]:
# 传递的 index 数组与构造 Series 的字典键冲突
dict = {"a": 1, "b": 2, "c": 3}
s1 = pd.Series(dict, index=["x", "b", "z"])
print(s1)

x    NaN
b    2.0
z    NaN
dtype: float64


In [6]:
# 利用 index 重排
dict = {"a": 1, "b": 2, "c": 3}
s1 = pd.Series(dict, index=["c", "a", "b"])
print(s1)

c    3
a    1
b    2
dtype: int64


### 1.1.4 参数`name`


In [7]:
dict_1 = {"Beijing": 1000, "Shanghai": 2000, "Guangzhou": 3000}
s1 = pd.Series(dict, name="Population")
s1.index.name = "City"
print(s1)
print("-" * 35)
dict_2 = {"Guangzhou": 38.3, "Haerbin": 20.3, "Chengdu": 28.7}
s2 = pd.Series(dict_2, name="Temperature")
s2.index.name = "City"
print(s2)
print("-" * 35)
df = pd.DataFrame({"Population": s1, "Temperature": s2})
print(df)
print("-" * 35)

print(df["Population"])

City
a    1
b    2
c    3
Name: Population, dtype: int64
-----------------------------------
City
Guangzhou    38.3
Haerbin      20.3
Chengdu      28.7
Name: Temperature, dtype: float64
-----------------------------------
           Population  Temperature
City                              
Chengdu           NaN         28.7
Guangzhou         NaN         38.3
Haerbin           NaN         20.3
a                 1.0          NaN
b                 2.0          NaN
c                 3.0          NaN
-----------------------------------
City
Chengdu      NaN
Guangzhou    NaN
Haerbin      NaN
a            1.0
b            2.0
c            3.0
Name: Population, dtype: float64


## 1.2 索引


### 1.2.1 通过标签索引

当索引标签不存在时，索引会报错，但是可以进行赋值，相当于新增数据

想索引多个标签的时候，需要将索引标签放进一个数组中, e.g. s1[['a', 'b', 'c']]


In [8]:
array = np.arange(6)

s1 = pd.Series(array, index=["a", "b", "c", "d", "e", "f"])
print(s1)
print("-" * 35)

print(s1["c"])

print("-" * 35)

# 新增数据
s1["love"] = "you"
print(s1)

print("-" * 35)
print(s1[["a", "c", "love"]])

a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64
-----------------------------------
2
-----------------------------------
a         0
b         1
c         2
d         3
e         4
f         5
love    you
dtype: object
-----------------------------------
a         0
c         2
love    you
dtype: object


### 1.2.2 通过位置索引

如果标签同样类型为 int，那么此时的索引优先匹配标签，如果标签数据类型不是 int，则按照位置从 0 开始递增索引

`一般不建议采用位置索引，标签索引是最安全的`


In [9]:
array = np.arange(6)

s1 = pd.Series(array)

s2 = pd.Series(array, index=[i for i in range(1, 7)])

s3 = pd.Series(array, index=[i for i in "abcdef"])

print("s1 = \n", s1)
print("-" * 35)
print("s2 = \n", s2)
print("-" * 35)
print("s3 = \n", s3)
print("-" * 35)

# 开始尝试位置索引
print("s1[2] = ", s1[2])
print("s2[2] = ", s2[2])
print("s3[2] = ", s3[2])

s1 = 
 0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
-----------------------------------
s2 = 
 1    0
2    1
3    2
4    3
5    4
6    5
dtype: int64
-----------------------------------
s3 = 
 a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64
-----------------------------------
s1[2] =  2
s2[2] =  1
s3[2] =  2


  print("s3[2] = ", s3[2])


## 1.3 切片

当位置索引与标签索引恰好一致，即 `index = [0,1,2,3,...] `时，切片不包含末端，否则标签索引切片包含末端，位置索引切片不包含末端


In [10]:
# 位置索引与标签索引恰好一致
s1 = pd.Series(np.arange(6))
print(s1)
print("-" * 35)
print(s1[0:3])

# 采用标签索引，包含末端
s2 = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5})
print(s2)
print("-" * 35)
print(s2["b":"d"])
print("-" * 35)

# 采用位置索引，不包含末端
print(s2[1:3])

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64
-----------------------------------
0    0
1    1
2    2
dtype: int64
a    1
b    2
c    3
d    4
e    5
dtype: int64
-----------------------------------
b    2
c    3
d    4
dtype: int64
-----------------------------------
b    2
c    3
dtype: int64


## 1.3 基本操作


### 1.3.1 查看前几条和后几条数据

`Series.head(int) / Series.tail(int)`


In [11]:
s = pd.Series(np.random.rand(15))
print(s.head(3))
print("-" * 35)
print(s.tail(3))

0    0.114446
1    0.947889
2    0.850779
dtype: float64
-----------------------------------
12    0.751800
13    0.091359
14    0.465568
dtype: float64


### 1.3.2 重排索引

`Series.reindex(list/ndarray, fill_value = NaN)`

如果新的索引中出现了原索引中没有出现过的索引，则产生新对象，并默认赋值为 NaN


In [12]:
s1 = pd.Series(np.random.rand(5), index=list("abcde"))
print(s1)
print("-" * 35)

s2 = s1.reindex(list("acdeb"))
print(s2)
print("-" * 35)

s3 = s1.reindex(list("cdepq"), fill_value=0)
print(s3)

a    0.264608
b    0.357120
c    0.634176
d    0.574796
e    0.565667
dtype: float64
-----------------------------------
a    0.264608
c    0.634176
d    0.574796
e    0.565667
b    0.357120
dtype: float64
-----------------------------------
c    0.634176
d    0.574796
e    0.565667
p    0.000000
q    0.000000
dtype: float64


### 1.3.3 对齐运算

可以按照标签索引对齐运算，没有对齐的标签索引对应的值为 NaN


In [13]:
s1 = pd.Series(np.random.rand(3), index=list("abc"))
s2 = pd.Series(np.random.rand(4), index=list("bcde"))
print(s1)
print("-" * 35)
print(s2)
print("-" * 35)
print(s1 + s2)

a    0.638953
b    0.358544
c    0.006646
dtype: float64
-----------------------------------
b    0.796181
c    0.829006
d    0.411556
e    0.273360
dtype: float64
-----------------------------------
a         NaN
b    1.154725
c    0.835652
d         NaN
e         NaN
dtype: float64


### 1.3.4 删除索引和值

`Series.drop(index, inplace = False)`

False 返回删除后的新 Series，原始 Series 不改变，True 返回 None，原始 Series 改变


In [14]:
s = pd.Series(np.random.rand(3), index=list("abc"))
print("s = \n", s)
print("-" * 35)

s1 = s.drop("a")
print("s1 = \n", s1)
print("-" * 35)
print("s = \n", s)
print("-" * 35)

s2 = s.drop("b", inplace=True)
print("s2 = ", s2)
print("-" * 35)
print("s = \n", s)

s = 
 a    0.566813
b    0.901056
c    0.954543
dtype: float64
-----------------------------------
s1 = 
 b    0.901056
c    0.954543
dtype: float64
-----------------------------------
s = 
 a    0.566813
b    0.901056
c    0.954543
dtype: float64
-----------------------------------
s2 =  None
-----------------------------------
s = 
 a    0.566813
c    0.954543
dtype: float64


### 1.3.5 添加索引和值

直接给新的索引赋值即可


In [15]:
s = pd.Series(np.random.rand(3), index=list("abc"))
print(s)
print("-" * 35)

s["love"] = "you"
print(s)

a    0.358333
b    0.161135
c    0.112032
dtype: float64
-----------------------------------
a       0.358333
b       0.161135
c       0.112032
love         you
dtype: object


# 2 `DataFrame`


## 2.1 创建 DataFrame

`pd.DataFrame(data, index = None, columns = None, dtype = None, copy = False)`


### 2.1.1 使用列表创建 DataFrame


In [16]:
list1 = [1, 2, 3, 4, 5]
df = pd.DataFrame(list1)
print(df)

   0
0  1
1  2
2  3
3  4
4  5


### 2.1.2 使用嵌套列表创建 DataFrame

一个列表对应一行，列名单独写


In [17]:
df = pd.DataFrame([["a", 1], ["b", 2], ["c", 3]], columns=["letter", "number"])
print(df)

  letter  number
0      a       1
1      b       2
2      c       3


### 2.1.3 使用列表嵌套字典创建 DataFrame

一个列表对应一行，列名为字典的键


In [18]:
df = pd.DataFrame(
    [{"Char": "a", "Capital": "A"}, {"Char": "b", "Capital": "B", "Num": "2"}],
    index=["Sample 1", "Sample 2"],
)
print(df)

# 如果只想显示部分列，可以使用 columns 参数
df_partial = pd.DataFrame(df, columns=["Char", "Capital"], copy=True)
print(df_partial)

         Char Capital  Num
Sample 1    a       A  NaN
Sample 2    b       B    2
         Char Capital
Sample 1    a       A
Sample 2    b       B


### 2.1.4 使用 Series 创建 DataFrame


In [19]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
data = {"Age": age, "Salary": salary}
df = pd.DataFrame(data)
print(df)

    Age  Salary
A  10.0    1000
B  20.0    2000
C  30.0    3000
D   NaN    4000


## 2.2 列操作


### 2.2.1 选取数据列

直接索引列名就可以取出数据列


In [20]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)
print(df["Salary"])

print("=" * 35)
print(df[["Age", "Salary"]])

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
A    1000
B    2000
C    3000
D    4000
Name: Salary, dtype: int64
    Age  Salary
A  10.0    1000
B  20.0    2000
C  30.0    3000
D   NaN    4000


### 2.2.2 增加数据列


#### 2.2.2.1 通过索引直接添加一个 Series


In [21]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

index = pd.Series([1, 2, 3, 4], index=["A", "B", "C", "D"])
df["Index"] = index
print(df)

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
    Age  Gender  Salary  Index
A  10.0    Male    1000      1
B  20.0  Female    2000      2
C  30.0    Male    3000      3
D   NaN    Male    4000      4


#### 2.2.2.2 通过 `insert()` 方法添加

`df.insert(loc, column, value, allow_duplicates = False)`

- loc：整型，插人索引，必须验证 0<=loc<=len(列)
- column： 插入列的标签，类型可以是（字符串/数字/散列对象）
- value：数值，Series 或者数组
- allow duplicates：允许重复，可以有相同的列标签数据，默认为 False


In [22]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

scores = pd.Series([100, 99, 98, 97], index=["A", "B", "C", "D"])
df.insert(2, column="Scores", value=scores, allow_duplicates=False)
print(df)

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
    Age  Gender  Scores  Salary
A  10.0    Male     100    1000
B  20.0  Female      99    2000
C  30.0    Male      98    3000
D   NaN    Male      97    4000


### 2.2.3 删除数据列


#### 2.2.3.1 通过 `del` 方法删除


In [23]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

del df["Salary"]
print(df)

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
    Age  Gender
A  10.0    Male
B  20.0  Female
C  30.0    Male
D   NaN    Male


#### 2.2.3.2 通过 `pop()` 方法删除

这种方法会有一个返回值，需要一个变量接收


In [24]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

returned_salary = df.pop("Salary")

print(df)
print("=" * 35)
print(returned_salary)

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
    Age  Gender
A  10.0    Male
B  20.0  Female
C  30.0    Male
D   NaN    Male
A    1000
B    2000
C    3000
D    4000
Name: Salary, dtype: int64


## 2.3 行操作


### 2.3.1 选取数据行


#### 2.3.1.1 使用 `df.loc[行标签，列标签]` 方法选取


In [25]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

print(df.loc["A"])
print("=" * 35)

print(df.loc[["A", "B"], ["Age", "Salary"]])

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
Age       10.0
Gender    Male
Salary    1000
Name: A, dtype: object
    Age  Salary
A  10.0    1000
B  20.0    2000


### 2.3.1.2 使用 `df.iloc[行索引，列索引]` 方法选取

这里的参数全部都是整型的位置索引


In [26]:
age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

print(df.iloc[0:2, 0:2])

print("=" * 35)


# 取第一行，第三行，第二列，第三列
print(df.iloc[[0, 2], [1, 2]])

    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
    Age  Gender
A  10.0    Male
B  20.0  Female
  Gender  Salary
A   Male    1000
C   Male    3000


### 2.3.2 增加数据行


使用 `pd.concat()` 方法增加数据行

`pd.concat(objs, axis = 0, join = 'outer', ignore_index = False, keys = None, levels = None, names = None, verify_integrity = False, sort = False, copy = True)`

- objs：Series、DataFrame 或者是 Series、DataFrame 组成的列表
- axis：指定轴，0 为行，1 为列
- join：指定合并方式，inner 为内连接，outer 为外连接
- ignore_index：是否忽略原始索引，默认为 False
- keys：为合并后的数据添加一个标签，用于区分数据来源
- levels：指定多层索引的级别
- names：指定多层索引的名称
- verify_integrity：检查合并后的数据是否有重复索引，如果有则抛出异常，默认为 False
- sort：对合并后的数据按照索引进行排序，默认为 False
- copy：是否复制数据，默认为 True


In [27]:
from os import name
import pandas as pd
import numpy as np

age = pd.Series([10, 20, 30], index=["A", "B", "C"], dtype=np.int64)
salary = pd.Series([1000, 2000, 3000, 4000], index=["A", "B", "C", "D"])
gender = pd.Series(["Male", "Female", "Male", "Male"], index=["A", "B", "C", "D"])
data = {"Age": age, "Gender": gender, "Salary": salary}
df = pd.DataFrame(data)
print(df)

print("=" * 35)

s_E = pd.Series({"Age": 10.0, "Gender": "Female", "Salary": 5000}, name="E")

# 在 df 中加入这一行，使用 pd.concat() 方法
df = pd.concat([df, s_E.to_frame().T])
print(df)


    Age  Gender  Salary
A  10.0    Male    1000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    4000
    Age  Gender Salary
A  10.0    Male   1000
B  20.0  Female   2000
C  30.0    Male   3000
D   NaN    Male   4000
E  10.0  Female   5000
