In [2]:
import pandas as pd
import numpy as np

## 第二章 Pandas基础

In [3]:
# Text files are often delimited by something other than whitespace. read_table
# accepts a `sep` argument so the caller can specify a custom delimiter.
table1 = pd.read_table('ch2/my_table_special_sep.txt')   # no `sep` given: every line ends up in a single column
print(table1)
# The file uses "||||" as its delimiter. A multi-character `sep` is interpreted as a
# regular expression, so each "|" has to be escaped and the Python parsing engine is
# requested explicitly. The pattern is a raw string so the backslashes reach pandas
# unchanged, without relying on Python's deprecated tolerance of the invalid
# escape sequence "\|" in ordinary string literals.
table2 = pd.read_table('ch2/my_table_special_sep.txt', sep=r"\|\|\|\|", engine='python')
print(table2)

              col1 |||| col2
0  TS |||| This is an apple.
1    GQ |||| My name is Bob.
2         WT |||| Well done!
  col1                 col2
0   TS    This is an apple.
1   GQ      My name is Bob.
2   WT           Well done!


### 基本函数操作

In [4]:
# Load the CSV that the following cells work on.
df = pd.read_csv(filepath_or_buffer='learn_pandas.csv')
# Print a concise summary of the DataFrame: index range, per-column non-null
# counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   School       200 non-null    object 
 1   Grade        200 non-null    object 
 2   Name         200 non-null    object 
 3   Gender       200 non-null    object 
 4   Height       183 non-null    float64
 5   Weight       189 non-null    float64
 6   Transfer     188 non-null    object 
 7   Test_Number  200 non-null    int64  
 8   Test_Date    200 non-null    object 
 9   Time_Record  200 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 15.8+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()     # basic descriptive statistics

Unnamed: 0,Height,Weight,Test_Number
count,183.0,189.0,200.0
mean,163.218033,55.015873,1.645
std,8.608879,12.824294,0.722207
min,145.4,34.0,1.0
25%,157.15,46.0,1.0
50%,161.9,51.0,1.5
75%,167.5,65.0,2.0
max,193.9,89.0,3.0


In [6]:
# Three views of the School column, printed one after another:
#   1. the distinct values,
#   2. how many distinct values there are,
#   3. how often each value occurs.
print(
    df['School'].unique(),
    df['School'].nunique(),
    df['School'].value_counts(),
    sep='\n',
)

['A' 'B' 'C' 'D']
4
D    69
A    57
C    40
B    34
Name: School, dtype: int64


In [7]:
# drop_duplicates shows the unique combinations across several columns.
# Its `keep` argument decides which row of each repeated combination survives:
# 'first' (the default) keeps the first occurrence, 'last' keeps the last
# occurrence, and False removes every row whose combination is duplicated.
df_demo = df.loc[:, ['Gender', 'Transfer', 'Name']]
df_demo.drop_duplicates(subset=['Gender', 'Transfer'], keep='last')

Unnamed: 0,Gender,Transfer,Name
147,Male,,Juan You
150,Male,Y,Chengpeng You
169,Female,Y,Chengquan Qin
194,Female,,Yanmei Qian
197,Female,N,Chengqiang Chu
199,Male,N,Chunpeng Lv


In [8]:
## Replacement functions
# Replacement is normally applied to a single column, so the examples below use a Series.
# The replacement functions in pandas fall into three groups: mapping-based,
# condition-based and numeric replacement.
df['Gender'].replace(to_replace={'Female': 0, 'Male': 1}).head(n=5)   # Female -> 0, Male -> 1

0    0
1    1
2    1
3    0
4    1
Name: Gender, dtype: int64

In [9]:
# Condition-based replacement is provided by where and mask, which mirror each other:
# where replaces the rows for which the condition is False, while mask replaces the
# rows for which it is True. When no replacement value is given, the replaced rows
# become missing values.
s = pd.Series([-1, 1.2345, 100, -50])
s.where(s.lt(0), other=100)

0     -1.0
1    100.0
2    100.0
3    -50.0
dtype: float64

In [10]:
# Other element-wise numeric helpers, not run here:
#   s.round(2) rounds to two decimal places, s.abs() takes the absolute value.
# clip bounds the values to [0, 2]: anything below 0 becomes 0 and anything above 2 becomes 2.
print(s.clip(lower=0, upper=2))
# To assign custom values to out-of-range entries, test the original values directly.
# Clipping first and then replacing 0 and 2 would also overwrite entries that were
# already exactly equal to one of the bounds and therefore never out of range.
# The two mask calls are kept separate so each side can be given its own value.
print(s.mask(s < 0, 1).mask(s > 2, 1))

0    0.0000
1    1.2345
2    2.0000
3    0.0000
dtype: float64
0    1.0000
1    1.2345
2    1.0000
3    1.0000
dtype: float64


In [41]:
## Sorting functions
# There are two ways to sort: by value (sort_values) and by index (sort_index).
# Grade and Name are moved into the index; Height and Weight stay as the data columns.
df_demo = df.set_index(['Grade', 'Name'])[['Height', 'Weight']]
df_demo

Unnamed: 0_level_0,Unnamed: 1_level_0,Height,Weight
Grade,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
Freshman,Gaopeng Yang,158.9,46.0
Freshman,Changqiang You,166.5,70.0
Senior,Mei Sun,188.9,89.0
Sophomore,Xiaojuan Sun,,41.0
Sophomore,Gaojuan You,174.0,74.0
...,...,...,...
Junior,Xiaojuan Sun,153.9,46.0
Senior,Li Zhao,160.9,50.0
Senior,Chengqiang Chu,153.9,45.0
Senior,Chengmei Shen,175.3,71.0


In [43]:
# Sort by height; ascending order is the default and is spelled out here.
df_demo.sort_values(by='Height', ascending=True).head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Height,Weight
Grade,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
Junior,Xiaoli Chu,145.4,34.0
Senior,Gaomei Lv,147.3,34.0
Sophomore,Peng Han,147.8,34.0
Senior,Changli Lv,148.7,41.0
Sophomore,Changjuan You,150.5,40.0


In [44]:
# Sorting on several columns is a common need. Here the rows are ordered by weight
# ascending and, among rows with the same weight, by height descending
# (Weight is the primary sort key, Height breaks the ties).
df_demo.sort_values(by=['Weight', 'Height'], ascending=[True, False]).head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Height,Weight
Grade,Name,Unnamed: 2_level_1,Unnamed: 3_level_1
Sophomore,Peng Han,147.8,34.0
Senior,Gaomei Lv,147.3,34.0
Junior,Xiaoli Chu,145.4,34.0
Sophomore,Qiang Zhou,150.5,36.0
Freshman,Yanqiang Xu,152.4,38.0
