In [83]:
import numpy as np
import pandas as pd
# Display the installed pandas version.
pd.__version__

'1.0.3'

In [98]:
# Build a Series of course names. No index is given, so pandas labels the
# rows 0..4; pass name="subject" as well to give the Series a name.
s = pd.Series(data=["军训", "思修", "史纲", "马原", "毛概"])
s


0    军训
1    思修
2    史纲
3    马原
4    毛概
dtype: object

In [96]:
# Turn a named Series into a one-column DataFrame: the Series name
# ("score") becomes the column label.
s2 = pd.Series(data=[88, 89, 90, 91, 92], name="score")
s2.to_frame()


Unnamed: 0,score
0,88
1,89
2,90
3,91
4,92


In [118]:
# Build a DataFrame from a dict of equal-length lists:
# every key becomes a column and every list supplies that column's values.
data = {
    "subject": ["军训", "思修", "史纲", "马原", "毛概"],
    "绩点": [88, 89, 90, 91, 92],
    "year": [2018, 2019, 2019, 2020, 2020],
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,subject,绩点,year
0,军训,88,2018
1,思修,89,2019
2,史纲,90,2019
3,马原,91,2020
4,毛概,92,2020


In [119]:
# Rename the "绩点" (grade) column to "score".
# rename() returns a new DataFrame; rebinding `df` to it (instead of mutating
# with inplace=True) keeps the step chainable and avoids silently altering an
# object that other names may still reference.
df = df.rename(columns={"绩点": "score"})
df

Unnamed: 0,subject,score,year
0,军训,88,2018
1,思修,89,2019
2,史纲,90,2019
3,马原,91,2020
4,毛概,92,2020


In [120]:
# Overwrite a single value: row label 3, column "score".
# With the default 0..4 index the label 3 exists, so the existing score
# in that row is replaced by 100.
df.loc[3,"score"]=100
df

Unnamed: 0,subject,score,year
0,军训,88,2018
1,思修,89,2019
2,史纲,90,2019
3,马原,100,2020
4,毛概,92,2020


In [121]:
# Replace the default 0..4 row labels with custom integer labels
# (one label per existing row, in row order).
df.index = [
    12090043,
    10610183,
    10610193,
    10610204,
    10610224,
]
df

Unnamed: 0,subject,score,year
12090043,军训,88,2018
10610183,思修,89,2019
10610193,史纲,90,2019
10610204,马原,100,2020
10610224,毛概,92,2020


In [122]:
# The index no longer contains the label 3, so the same .loc assignment now
# appends a brand-new row labelled 3 instead of overwriting an existing one.
# "subject" and "year" receive no value for that row and show up as NaN
# (missing); score and year are displayed as floats from here on.
df.loc[3,"score"]=100 
df

Unnamed: 0,subject,score,year
12090043,军训,88.0,2018.0
10610183,思修,89.0,2019.0
10610193,史纲,90.0,2019.0
10610204,马原,100.0,2020.0
10610224,毛概,92.0,2020.0
3,,100.0,


In [123]:
# Quick summary statistics for the numeric columns (anything more detailed
# has to be computed by hand). Missing values are not counted, which is why
# "year" reports a count of 5 while "score" reports 6.
df.describe()

Unnamed: 0,score,year
count,6.0,5.0
mean,93.166667,2019.2
std,5.455884,0.83666
min,88.0,2018.0
25%,89.25,2019.0
50%,91.0,2019.0
75%,98.0,2020.0
max,100.0,2020.0


In [124]:
# Remove the row labelled 3 again, then attach the credit value of each
# remaining course as a new "credit" column (one value per row, in row order).
df = df.drop(index=3)
df["credit"] = [3, 3, 4, 4, 4]
df

Unnamed: 0,subject,score,year,credit
12090043,军训,88.0,2018.0,3
10610183,思修,89.0,2019.0,3
10610193,史纲,90.0,2019.0,4
10610204,马原,100.0,2020.0,4
10610224,毛概,92.0,2020.0,4


In [125]:
# Convert each score to a grade point using 90 as the cutoff:
# 4.0 for a score of 90 or more, 3.6 otherwise.
# The rule is applied element-wise with a lambda expression.
df["point"] = df["score"].apply(
    lambda score: 4.0 if score >= 90 else 3.6
)
df

Unnamed: 0,subject,score,year,credit,point
12090043,军训,88.0,2018.0,3,3.6
10610183,思修,89.0,2019.0,3,3.6
10610193,史纲,90.0,2019.0,4,4.0
10610204,马原,100.0,2020.0,4,4.0
10610224,毛概,92.0,2020.0,4,4.0


In [129]:
# Order the rows from the highest score to the lowest.
df = df.sort_values("score", ascending=False)
df

Unnamed: 0,subject,score,year,credit,point,sum
10610204,马原,100.0,2020.0,4,4.0,16.0
10610224,毛概,92.0,2020.0,4,4.0,16.0
10610193,史纲,90.0,2019.0,4,4.0,16.0
10610183,思修,89.0,2019.0,3,3.6,10.8
12090043,军训,88.0,2018.0,3,3.6,10.8


In [130]:
# First step of the credit-weighted GPA: store credit x grade point
# for every course in a "sum" column.
df["sum"] = df["credit"].mul(df["point"])
df

Unnamed: 0,subject,score,year,credit,point,sum
10610204,马原,100.0,2020.0,4,4.0,16.0
10610224,毛概,92.0,2020.0,4,4.0,16.0
10610193,史纲,90.0,2019.0,4,4.0,16.0
10610183,思修,89.0,2019.0,3,3.6,10.8
12090043,军训,88.0,2018.0,3,3.6,10.8


In [131]:
# GPA as the credit-weighted mean of the grade points:
# total of the per-course "sum" values divided by the total number of credits.
GPA = df["sum"].sum() / df["credit"].sum()
GPA

3.8666666666666663

In [134]:
# Group and aggregate: mean score and mean grade point for each year.
# Several columns must be selected from a GroupBy with a *list*
# ([["score", "point"]]). The bare form ['score', 'point'] is an implicit
# tuple of keys, which pandas 1.0 deprecates (FutureWarning) and pandas 2.x
# rejects. .mean() is the built-in equivalent of .agg(np.mean).
b = df.groupby("year")[["score", "point"]].mean()
b

Unnamed: 0_level_0,score,point
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2018.0,88.0,3.6
2019.0,89.5,3.8
2020.0,96.0,4.0
