In [9]:
import pandas as pd
import numpy as np
# Load only the columns this notebook uses, then show (rows, columns).
# The path is a raw string: in a normal string literal the backslashes form
# invalid escape sequences ('\M', '\j', '\d'), which recent Python versions
# warn about. The resulting path value is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(
    r'E:\MyProject\joyful-pandas-master\data/learn_pandas.csv',
    usecols=['Grade', 'Name', 'Gender', 'Height', 'Weight'],
)
df.shape

(200, 5)

In [2]:
# Cast the Grade column to the categorical dtype and preview the first five rows
s = df['Grade'].astype(dtype='category')
s.head(n=5)

0     Freshman
1     Freshman
2       Senior
3    Sophomore
4    Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']

In [4]:
# A categorical has two parts: the categories themselves, stored as an Index,
# and a flag saying whether they are ordered. Both are reachable through .cat.
s.cat.categories

Index(['Freshman', 'Junior', 'Senior', 'Sophomore'], dtype='object')

In [5]:
s.cat.ordered
# Whether the categories are ordered

False

In [6]:
# Each category gets a unique integer code; the code is determined by the
# category's position in cat.categories.
s.cat.codes.head(n=5)

0    0
1    0
2    2
3    3
4    3
dtype: int8

类别的增删改查

In [43]:
# add_categories: append a new category to the existing ones
s = s.cat.add_categories(['Graduated'])
s.cat.categories

Index(['Freshman', 'Junior', 'Senior', 'Sophomore', 'Graduated'], dtype='object')

In [44]:
# remove_categories deletes a category; every value in the Series that
# belonged to it becomes missing.
s = s.cat.remove_categories(['Sophomore'])
s.cat.categories

Index(['Freshman', 'Graduated', 'Junior', 'Senior'], dtype='object')

In [45]:
# Preview the Series after removing 'Sophomore' (those rows are now missing)
s.head()

0    Freshman
1    Freshman
2      Senior
3         NaN
4         NaN
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Graduated', 'Junior', 'Senior']

In [46]:
# set_categories replaces the whole category set; values whose old category
# is not among the new ones become missing.
s = s.cat.set_categories(
    ['Freshman', 'PhD']
)
s.cat.categories

Index(['Freshman', 'PhD'], dtype='object')

In [47]:
# Preview the Series after replacing the category set
s.head()

0    Freshman
1    Freshman
2         NaN
3         NaN
4         NaN
Name: Grade, dtype: category
Categories (2, object): ['Freshman', 'PhD']

In [48]:
# remove_unused_categories drops categories that never occur in the Series
s = s.cat.remove_unused_categories()
# The unused 'PhD' category has been removed
s.cat.categories

Index(['Freshman'], dtype='object')

In [51]:
# rename_categories changes category labels. Note that the matching values
# in the Series are renamed along with the category.
s = s.cat.rename_categories(
    {'Freshman': '本科一年级学生'}
)
s.head(n=5)

0    本科一年级学生
1    本科一年级学生
2        NaN
3        NaN
4        NaN
Name: Grade, dtype: category
Categories (1, object): ['本科一年级学生']

有序分类

In [11]:
# reorder_categories turns the categorical into an ordered one. The list passed
# in must contain exactly the current (unordered) categories — nothing added,
# nothing missing — and ordered=True must be given.
s = (
    df['Grade']
    .astype('category')
    .cat.reorder_categories(
        ['Freshman', 'Junior', 'Senior', 'Sophomore'],
        ordered=True,
    )
)
s.head(n=5)

0     Freshman
1     Freshman
2       Senior
3    Sophomore
4    Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman' < 'Junior' < 'Senior' < 'Sophomore']

In [54]:
# as_unordered returns the categorical without an ordering
s.cat.as_unordered().head(n=5)

0     Freshman
1     Freshman
2       Senior
3    Sophomore
4    Sophomore
Name: Grade, dtype: category
Categories (4, object): ['Freshman', 'Junior', 'Senior', 'Sophomore']

In [12]:
# Sort rows by an ordered categorical: make Grade an ordered category in
# academic order, then sort by that column.
df['Grade'] = (
    df['Grade']
    .astype('category')
    .cat.reorder_categories(
        ['Freshman', 'Sophomore', 'Junior', 'Senior'],
        ordered=True,
    )
)
df.sort_values(by='Grade').head(n=5)

Unnamed: 0,Grade,Name,Gender,Height,Weight
0,Freshman,Gaopeng Yang,Female,158.9,46.0
105,Freshman,Qiang Shi,Female,164.5,52.0
96,Freshman,Changmei Feng,Female,163.8,56.0
88,Freshman,Xiaopeng Han,Female,164.1,53.0
81,Freshman,Yanli Zhang,Female,165.1,52.0


In [13]:
# Sort by index: use Grade as the index and sort on it
(
    df
    .set_index('Grade')
    .sort_index()
    .head(n=5)
)

Unnamed: 0_level_0,Name,Gender,Height,Weight
Grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Freshman,Gaopeng Yang,Female,158.9,46.0
Freshman,Qiang Shi,Female,164.5,52.0
Freshman,Changmei Feng,Female,163.8,56.0
Freshman,Xiaopeng Han,Female,164.1,53.0
Freshman,Yanli Zhang,Female,165.1,52.0


区间类别

In [72]:
# pd.cut splits the data into groups.
# The key parameter is bins: bins=n produces n groups.
# Intervals are left-open/right-closed by default; pass right=False for
# left-closed/right-open intervals.
s = pd.Series(data=[2, 5])
pd.cut(x=s, bins=2)

0    (1.997, 3.5]
1      (3.5, 5.0]
dtype: category
Categories (2, interval[float64, right]): [(1.997, 3.5] < (3.5, 5.0]]

In [73]:
# Three bins, left-closed/right-open
pd.cut(x=s, bins=3, right=False)

0      [2.0, 3.0)
1    [4.0, 5.003)
dtype: category
Categories (3, interval[float64, left]): [[2.0, 3.0) < [3.0, 4.0) < [4.0, 5.003)]

In [79]:
# Another common use of bins is an explicit list of bin edges; np.inf stands
# for infinity. np.inf is used instead of the np.infty alias because that
# alias was removed in NumPy 2.0.
pd.cut(s, bins=[-np.inf, 1, 2.5, 4, 6, np.inf])
# Five intervals are defined, but s has only two elements, so the result has two rows

0    (1.0, 2.5]
1    (4.0, 6.0]
dtype: category
Categories (5, interval[float64, right]): [(-inf, 1.0] < (1.0, 2.5] < (2.5, 4.0] < (4.0, 6.0] < (6.0, inf]]

In [32]:
# Bin ages into three equal-width, labelled groups. With retbins=True pd.cut
# returns a tuple: (binned values, bin edges).
# The frame has its own name so it does not overwrite `df`, the learn_pandas
# data that later cells still read (e.g. `df.Weight`).
age_df = pd.DataFrame({'Age': [25, 30, 35, 40, 45, 50, 55, 60, 65]})
labels = ['young', 'middle', 'old']
result = pd.cut(age_df['Age'], bins=3, labels=labels, retbins=True)

In [33]:
# First element of the tuple: the labelled bin for each age
result[0]

0     young
1     young
2     young
3    middle
4    middle
5    middle
6       old
7       old
8       old
Name: Age, dtype: category
Categories (3, object): ['young' < 'middle' < 'old']

In [34]:
# Second element of the tuple: the bin edges
result[1]

array([24.96      , 38.33333333, 51.66666667, 65.        ])

In [25]:
# Two commonly used parameters: labels names the intervals, and retbins says
# whether the bin edges are returned as well (they are not by default).
s = pd.Series(data=[2, 5])
res = pd.cut(
    x=s,
    bins=2,
    labels=['Small', 'Big'],
    retbins=True,
)

In [26]:
# First element: the labelled bin for each value
res[0]

0    Small
1      Big
dtype: category
Categories (2, object): ['Small' < 'Big']

In [27]:
res[1]  # this element holds the returned bin edges

array([1.997, 3.5  , 5.   ])

In [1]:
import pandas as pd
import numpy as np

In [17]:
# Draw three random integers from [0, 5). No seed is set in this cell, so the
# values can differ from run to run.
data = np.random.randint(low=0, high=5, size=3, dtype=int)
data

array([0, 2, 3])

In [18]:
# Split `data` into quartile bins. `data` is a tiny random integer sample, so
# quantile edges can coincide; duplicates='drop' merges equal edges instead of
# raising ValueError. When the edges are already unique the result is the same.
pd.qcut(data, q=4, duplicates='drop')

[(-0.001, 1.0], (1.0, 2.0], (2.5, 3.0]]
Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] < (2.0, 2.5] < (2.5, 3.0]]

In [89]:
# Bin the Weight column into quartiles: q=4 gives four equal-frequency bins.
# NOTE(review): needs `df` to be the learn_pandas frame with a Weight column.
s = df['Weight']
pd.qcut(x=s, q=4).head(n=5)

0    (33.999, 46.0]
1      (65.0, 89.0]
2      (65.0, 89.0]
3    (33.999, 46.0]
4      (65.0, 89.0]
Name: Weight, dtype: category
Categories (4, interval[float64, right]): [(33.999, 46.0] < (46.0, 51.0] < (51.0, 65.0] < (65.0, 89.0]]

In [90]:
# Custom quantile boundaries instead of an equal split
pd.qcut(x=s, q=[0, 0.2, 0.5, 0.7, 1]).head(n=5)

0      (44.0, 51.0]
1      (56.6, 89.0]
2      (56.6, 89.0]
3    (33.999, 44.0]
4      (56.6, 89.0]
Name: Weight, dtype: category
Categories (4, interval[float64, right]): [(33.999, 44.0] < (44.0, 51.0] < (51.0, 56.6] < (56.6, 89.0]]

9.3.2 一般区间的构造

In [92]:
# The closed argument is one of 'right', 'left', 'both' or 'neither' and says
# which endpoints of the interval are closed.
my_interval = pd.Interval(left=2, right=6, closed='right')
my_interval

Interval(2, 6, closed='right')

In [93]:
# An Interval exposes mid, length, right, left and closed: the midpoint,
# length, right endpoint, left endpoint and open/closed state.
# `in` tests whether a value lies inside the interval.
3 in my_interval

True

In [94]:
# overlaps tells whether two intervals have any point in common
my_interval2 = pd.Interval(left=6, right=7, closed='left')
my_interval.overlaps(my_interval2)

True

In [96]:
# A pd.IntervalIndex is usually built in one of four ways: from_breaks,
# from_arrays, from_tuples and interval_range, each suited to a different case.
# from_breaks works like cut or qcut, except that those compute the split
# points while from_breaks takes user-supplied split points directly.
pd.IntervalIndex.from_breaks(
    breaks=[4, 6, 9, 15],
    closed='both',
)

IntervalIndex([[4, 6], [6, 9], [9, 15]], dtype='interval[int64, both]')

In [100]:
# from_arrays takes the left endpoints and the right endpoints as two separate
# lists; it suits intervals that may overlap when the start and end points are known.
pd.IntervalIndex.from_arrays(
    left=[1, 4, 7, 13],
    right=[2, 5, 8, 14],
    closed='neither',
)

IntervalIndex([(1, 2), (4, 5), (7, 8), (13, 14)], dtype='interval[int64, neither]')

In [101]:
# from_tuples takes a list of (start, end) tuples
pd.IntervalIndex.from_tuples(
    data=[(1, 4), (2, 5), (4, 11)],
    closed='neither',
)

IntervalIndex([(1, 4), (2, 5), (4, 11)], dtype='interval[int64, neither]')

In [104]:
# An evenly spaced sequence of intervals is fixed by its start, end, number of
# intervals and interval length; interval_range's start, end, periods and freq
# parameters correspond to those four quantities.
# Supplying three of them is enough.
pd.interval_range(
    start=1,
    end=8,
    periods=4,
)

IntervalIndex([(1.0, 2.75], (2.75, 4.5], (4.5, 6.25], (6.25, 8.0]], dtype='interval[float64, right]')

In [105]:
# Same idea, this time fixing the end, the number of intervals and their length
pd.interval_range(
    end=6,
    periods=3,
    freq=1,
)

IntervalIndex([(3, 4], (4, 5], (5, 6]], dtype='interval[int64, right]')

In [106]:
# pd.IntervalIndex([...], closed=...) turns a list of Interval objects into an
# interval index; every interval is coerced to the given closed type.
pd.IntervalIndex(
    data=[my_interval, my_interval2],
    closed='right',
)

IntervalIndex([(2, 6], (6, 7]], dtype='interval[int64, right]')

9.3.3 区间的属性与方法

In [107]:
# To analyse the result of cut or qcut, first convert it to an IntervalIndex
id_interval = pd.IntervalIndex(pd.cut(x=s, bins=3))

In [108]:
# Keep the first five intervals for display
id_demo = id_interval[0:5]
id_demo

IntervalIndex([(33.945, 52.333], (52.333, 70.667], (70.667, 89.0], (33.945, 52.333], (70.667, 89.0]], dtype='interval[float64, right]', name='Weight')

In [109]:
# IntervalIndex has several common attributes: left, right, mid and length —
# the left and right endpoints, the mean of the two endpoints, and the interval length.
id_demo.left
# Left endpoint values

Index([33.945, 52.333, 70.667, 33.945, 70.667], dtype='float64')

In [110]:
id_demo.right
# Right endpoint values

Index([52.333, 70.667, 89.0, 52.333, 89.0], dtype='float64')

In [111]:
id_demo.mid
# Mean of the two endpoints

Index([43.138999999999996, 61.5, 79.8335, 43.138999999999996, 79.8335], dtype='float64')

In [112]:
id_demo.length
# Interval length (distance between the two endpoints)

Index([18.387999999999998, 18.334000000000003, 18.333, 18.387999999999998,
       18.333],
      dtype='float64')