In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Three labelled columns. `age` has no entry for "D", so aligning on the
# index leaves a NaN there and the Age column ends up as float64.
age = pd.Series(data=[20, 20, 30], index=list("ABC"), dtype="int64")
salary = pd.Series(data=[4000, 2000, 3000, 1000], index=list("ABCD"))
gender = pd.Series(data=["Male", "Female", "Male", "Male"], index=list("ABCD"))
data = dict(Age=age, Gender=gender, Salary=salary)
df = pd.DataFrame(data)
print(df)

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000


# 1 属性


## 1.1 转置

`df.T`


In [4]:
# Original frame, a divider line, then the transposed view (rows <-> columns).
print(df, "=" * 35, df.T, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
           A       B     C     D
Age     20.0    20.0  30.0   NaN
Gender  Male  Female  Male  Male
Salary  4000    2000  3000  1000


## 1.2 返回行标签

`df.index`


In [5]:
# Original frame, a divider line, then its row labels.
print(df, "=" * 35, df.index, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
Index(['A', 'B', 'C', 'D'], dtype='object')


## 1.3 返回列标签

`df.columns`


In [6]:
# Original frame, a divider line, then its column labels.
print(df, "=" * 35, df.columns, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
Index(['Age', 'Gender', 'Salary'], dtype='object')


## 1.4 返回行标签和列标签

`df.axes`


In [7]:
# Original frame, a divider line, then both axes: [row labels, column labels].
print(df, "=" * 35, df.axes, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
[Index(['A', 'B', 'C', 'D'], dtype='object'), Index(['Age', 'Gender', 'Salary'], dtype='object')]


## 1.5 返回每一列的数据类型

`df.dtypes`


In [8]:
# Original frame, a divider line, then the dtype of every column.
print(df, "=" * 35, df.dtypes, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
Age       float64
Gender     object
Salary      int64
dtype: object


## 1.6 判断是否是空表

`df.empty`


In [9]:
# Original frame, a divider line, then the `empty` flag.
print(df, "=" * 35, df.empty, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
False


## 1.7 获取 DataFrame 的形状

`df.shape`


In [10]:
# Original frame, a divider line, then its shape as (rows, columns).
print(df, "=" * 35, df.shape, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
(4, 3)


## 1.8 获取 DataFrame 的元素个数

`df.size`


In [11]:
# Original frame, a divider line, then the total number of cells.
print(df, "=" * 35, df.size, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
12


## 1.9 获取 DataFrame 的各个元素值

`df.values`


In [12]:
# Original frame, a divider line, then the underlying values as a NumPy array.
print(df, "=" * 35, df.values, sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
[[20.0 'Male' 4000]
 [20.0 'Female' 2000]
 [30.0 'Male' 3000]
 [nan 'Male' 1000]]


## 1.10 获取 DataFrame 的总体信息

`df.info()`


In [13]:
print(df)
print("=" * 35)

# info() writes its summary to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line after the report.
df.info()

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Age     3 non-null      float64
 1   Gender  4 non-null      object 
 2   Salary  4 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 300.0+ bytes
None


# 2 方法


## 2.1 查看前几行数据

`df.head(n)`


In [14]:
# Original frame, a divider line, then only the first two rows.
print(df, "=" * 35, df.head(2), sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000


## 2.2 查看后几行数据

`df.tail(n)`


In [15]:
# Original frame, a divider line, then only the last two rows.
print(df, "=" * 35, df.tail(2), sep="\n")

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
    Age Gender  Salary
C  30.0   Male    3000
D   NaN   Male    1000


## 2.3 修改行列名

`df.rename(columns={'old_name': 'new_name'})`


In [16]:
print(df, "=" * 35, sep="\n")

# Rename one column: Age -> 年龄. rename() returns a new frame, so `df`
# itself is left untouched.
df1 = df.rename(columns={"Age": "年龄"})
print(df1)

# Rename one row label: A -> No1.
df2 = df.rename(index={"A": "No1"})
print(df2)

# Replace every column name at once by assigning to `.columns` on a copy.
df3 = df.copy()
df3.columns = ["年龄", "性别", "薪资"]
print(df3)

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
     年龄  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
      Age  Gender  Salary
No1  20.0    Male    4000
B    20.0  Female    2000
C    30.0    Male    3000
D     NaN    Male    1000
     年龄      性别    薪资
A  20.0    Male  4000
B  20.0  Female  2000
C  30.0    Male  3000
D   NaN    Male  1000


## 2.4 根据标签名称重排行列标签

`df.sort_index()`

- `axis`: 指定排序的轴，`0`表示按行标签排序，`1`表示按列标签排序。
- `ascending`: 指定排序的顺序，`True`表示升序，`False`表示降序。
- `inplace`: 指定是否在原数据上进行排序，`True`表示在原数据上进行排序，`False`表示返回排序后的副本。
- `na_position`: 指定**标签本身**为缺失值（`NaN`）时的排放位置，与表中数据是否含有缺失值无关；`'last'`表示将缺失的标签放在排序结果的最后，`'first'`表示放在最前面。若被排序的行/列标签中没有 `NaN`，该参数不会产生任何可见效果。


In [17]:
print(df)
print("=" * 35)

# Sort the column labels in descending order.
# na_position controls where NaN *labels* go, not NaN cell values. The NaN
# written into the Gender column below is data, and none of the labels is NaN,
# so na_position="first" has no visible effect in either call.
df4 = df.copy()
df4.loc[["B"], "Gender"] = np.nan
df4.sort_index(axis=1, ascending=False, inplace=True, na_position="first")
print(df4)
print("=" * 35)

# Sort the row labels in descending order. Row D comes first because "D" is
# the largest label, not because its Age is NaN.
df4.sort_index(axis=0, ascending=False, inplace=True, na_position="first")
print(df4)

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
   Salary Gender   Age
A    4000   Male  20.0
B    2000    NaN  20.0
C    3000   Male  30.0
D    1000   Male   NaN
   Salary Gender   Age
D    1000   Male   NaN
C    3000   Male  30.0
B    2000    NaN  20.0
A    4000   Male  20.0


## 2.5 根据某列数据重排行

`df.sort_values(by='column_name', ascending=True, inplace=False, na_position='last')`


In [18]:
print(df)
print("=" * 35)

# Sort by Age in descending order (ascending=False). With the default
# na_position="last", the row whose Age is NaN stays at the bottom.
df5 = df.copy()
df5.sort_values(by="Age", ascending=False, inplace=True)
print(df5)
print("=" * 35)

# Sort by Age ascending, break ties by Salary descending, and put rows whose
# sort key is NaN first.
df6 = df.copy()
df6.sort_values(
    by=["Age", "Salary"], ascending=[True, False], inplace=True, na_position="first"
)
print(df6)

    Age  Gender  Salary
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000
D   NaN    Male    1000
    Age  Gender  Salary
C  30.0    Male    3000
A  20.0    Male    4000
B  20.0  Female    2000
D   NaN    Male    1000
    Age  Gender  Salary
D   NaN    Male    1000
A  20.0    Male    4000
B  20.0  Female    2000
C  30.0    Male    3000


## 2.6 将某一列设置为 index

`df.set_index(keys, drop=True, append=False, inplace=False, verify_integrity=False)`

- `keys`: 指定要设置为 index 的列名。若有多列，则以列表形式传入。
- `drop`: 指定是否将设置为 index 的列从 DataFrame 中删除。
- `append`: 指定是否将设置为 index 的列追加到原有的 index 上。
- `inplace`: 指定是否在原数据上进行操作。
- `verify_integrity`: 指定是否检查新设置的 index 是否有重复值。


In [20]:
# Rebinds `df` to a small sales table, one row per (month, year, sale) record.
# The earlier Age/Gender/Salary frame is no longer reachable under this name.
df = pd.DataFrame(
    [(1, 2012, 55), (4, 2014, 40), (7, 2013, 84), (10, 2014, 31)],
    columns=["month", "year", "sale"],
)
print(df)

   month  year  sale
0      1  2012    55
1      4  2014    40
2      7  2013    84
3     10  2014    31


In [28]:
# Append `month` to the existing row index instead of replacing it. The result
# has a two-level index (original position, month), and `df` is not modified.
new_df = df.set_index("month", append=True)
print(new_df)
# Select the row whose original position is 1 and whose month is 4.
print(new_df.loc[(1, 4)])

         year  sale
  month            
0 1      2012    55
1 4      2014    40
2 7      2013    84
3 10     2014    31
year    2014
sale      40
Name: (1, 4), dtype: int64
