In [1]:
import pandas as pd

In [2]:
# Read the workbook at the relative path "xl/course_participants.xlsx" into a DataFrame.
# The result is only displayed, not assigned to a variable. As the output shows,
# the rows get a default integer index (0..3) and user_id is an ordinary column.
pd.read_excel("xl/course_participants.xlsx")

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


In [3]:
# Hand-build the participant table: one inner list per participant, holding
# name, age, country, score and continent in that order.
data = [
    ["Mark", 55, "Italy", 4.5, "Europe"],
    ["John", 33, "USA", 6.7, "America"],
    ["Tim", 41, "USA", 3.9, "America"],
    ["Jenny", 12, "Germany", 9.0, "Europe"],
]

# The explicit index supplies the row labels (one per inner list, same order);
# the columns list names the five fields of each row.
df = pd.DataFrame(
    data,
    index=[1001, 1000, 1002, 1003],
    columns=["name", "age", "country", "score", "continent"],
)
df

Unnamed: 0,name,age,country,score,continent
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [4]:
# Print a summary of df: its index range, each column's non-null count and dtype,
# and the memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 1001 to 1003
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   name       4 non-null      object 
 1   age        4 non-null      int64  
 2   country    4 non-null      object 
 3   score      4 non-null      float64
 4   continent  4 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 192.0+ bytes


In [5]:
# Inspect the row index; it holds the integer labels given when df was constructed.
df.index

Index([1001, 1000, 1002, 1003], dtype='int64')

In [6]:
# Give the row index the name "user_id"; the name is rendered as a header
# above the index labels when the frame is displayed.
df = df.rename_axis(index="user_id")
df

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [7]:
# reset_index() turns the index back into an ordinary column and replaces the
# current index with the default one.
# The result looks the same as the DataFrame read from the Excel file earlier.
df.reset_index()

Unnamed: 0,user_id,name,age,country,score,continent
0,1001,Mark,55,Italy,4.5,Europe
1,1000,John,33,USA,6.7,America
2,1002,Tim,41,USA,3.9,America
3,1003,Jenny,12,Germany,9.0,Europe


In [8]:
# Method chaining: reset_index() returns a DataFrame, so set_index() can be
# called directly on that result without writing out an intermediate value.
(
    df
    .reset_index()      # user_id goes back to being an ordinary column
    .set_index("name")  # the "name" column becomes the index
)

Unnamed: 0_level_0,user_id,age,country,score,continent
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mark,1001,55,Italy,4.5,Europe
John,1000,33,USA,6.7,America
Tim,1002,41,USA,3.9,America
Jenny,1003,12,Germany,9.0,Europe


In [9]:
# reindex() keeps every row whose label matches the new index; labels with no
# match (999 and 1004 here) produce rows filled with missing values (NaN).
# Note that in the output the age values are shown as 33.0 / 55.0 instead of 33 / 55.
df.reindex(index=[999, 1000, 1001, 1004])

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
999,,,,,
1000,John,33.0,USA,6.7,America
1001,Mark,55.0,Italy,4.5,Europe
1004,,,,,


In [10]:
# Use sort_index() to sort the rows by their index labels.
# The result is only displayed; df itself is not reassigned.
df.sort_index()

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,John,33,USA,6.7,America
1001,Mark,55,Italy,4.5,Europe
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [11]:
# sort_values() sorts by one or more columns.
# Here the rows are ordered by "continent" first and then by "age"; to sort by
# a single column, pass just that one column.
df.sort_values(by=["continent", "age"])

Unnamed: 0_level_0,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe
1001,Mark,55,Italy,4.5,Europe


In [12]:
# Inspect the column labels; like the row labels, they are stored in an Index object.
df.columns

Index(['name', 'age', 'country', 'score', 'continent'], dtype='object')

In [13]:
# Name the columns axis "properties"; the name appears in the header row
# of the displayed frame.
df = df.rename_axis(columns="properties")
df

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [14]:
# Rename columns with rename(): each key is an existing column label and each
# value is its replacement. Columns not listed keep their labels.
df.rename(
    columns={
        "name": "First name",
        "age": "Age",
    }
)

properties,First name,Age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [15]:
# Remove rows and columns with drop(): `index` lists the row labels to discard
# and `columns` lists the column labels to discard.
df.drop(index=[1000, 1003], columns=["name", "country"])

properties,age,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1001,55,4.5,Europe
1002,41,3.9,America


In [16]:
# Transposing a DataFrame swaps its rows and columns.
# df.T is the shorthand for this call.
df.transpose()

user_id,1001,1000,1002,1003
properties,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
name,Mark,John,Tim,Jenny
age,55,33,41,12
country,Italy,USA,USA,Germany
score,4.5,6.7,3.9,9.0
continent,Europe,America,America,Europe


In [17]:
# Change the column order of the DataFrame. The reindex() method used on the
# index earlier would also work, but spelling out the desired column order
# with loc[] is usually more intuitive.
df.loc[:, ["continent", "country", "name", "age", "score"]]

properties,continent,country,name,age,score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Europe,Italy,Mark,55,4.5
1000,America,USA,John,33,6.7
1002,America,USA,Tim,41,3.9
1003,Europe,Germany,Jenny,12,9.0


In [18]:
# Select data by label with loc[].
# When both the row and the column are given as scalars, the result is a scalar too.
df.loc[1001, "name"]

'Mark'

In [19]:
# When only one of the two selectors is a scalar (here the column), the result
# is a Series; the rows are picked with a list of labels.
df.loc[[1001, 1002], "age"]

user_id
1001    55
1002    41
Name: age, dtype: int64

In [20]:
# Selecting multiple rows and multiple columns returns a DataFrame.
# The label slice :1002 runs from the first row up to and including the row
# labelled 1002, following the index's current order (1001, 1000, 1002).
df.loc[:1002, ["name", "country"]]

properties,name,country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,Mark,Italy
1000,John,USA
1002,Tim,USA


In [21]:
# Display the full frame again as a reference point for the selections that follow.
df

properties,name,age,country,score,continent
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1001,Mark,55,Italy,4.5,Europe
1000,John,33,USA,6.7,America
1002,Tim,41,USA,3.9,America
1003,Jenny,12,Germany,9.0,Europe


In [36]:
# Select data by integer position with iloc[].
# A scalar row position and a scalar column position return a scalar.
df.iloc[0, 0]

'Mark'

In [37]:
# A list of row positions combined with a scalar column position returns a Series.
df.iloc[[0, 2], 1]

user_id
1001    55
1002    41
Name: age, dtype: int64

In [38]:
# A positional slice combined with a list of column positions returns a DataFrame.
# The slice :3 covers positions 0, 1 and 2 (three rows in the output).
df.iloc[:3, [0, 2]]

properties,name,country
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1001,Mark,Italy
1000,John,USA
1002,Tim,USA
