## Converting continuous values into categorical values

In [1]:
# Sample ages (a continuous variable) used below to demonstrate
# binning / bucketizing with pd.cut.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Bin edges for pd.cut: each pair of consecutive edges delimits one age bucket.
bins = [18, 25, 35, 60, 80]

In [4]:
# Assign every age to the bucket (interval between consecutive edges) it falls in.
categories = pd.cut(x=ages, bins=bins)

In [5]:
categories

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 80], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 80]]

In [6]:
# Integer code per age: the position of its bin within categories.categories.
categories.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [7]:
# The bins themselves, as an IntervalIndex (intervals are closed on the right).
categories.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 80]],
              closed='right',
              dtype='interval[int64]')

In [8]:
# Count how many ages fall into each bin. The top-level pd.value_counts helper
# is deprecated (pandas 2.1+), so wrap the Categorical in a Series and call the
# Series.value_counts method, which likewise orders the result by count.
pd.Series(categories).value_counts()

(18, 25]    5
(35, 60]    3
(25, 35]    3
(60, 80]    1
dtype: int64

In [9]:
# pd.cut on a plain Python list returns a pandas Categorical.
type(categories)

pandas.core.arrays.categorical.Categorical

In [11]:
# Decade-wide bins with edges 10, 20, ..., 90; right=True (the default) makes
# each interval include its right edge, e.g. an age of 20 lands in (10, 20].
pd.cut(ages, bins=list(range(10, 91, 10)), right=True)

[(10, 20], (20, 30], (20, 30], (20, 30], (20, 30], ..., (30, 40], (60, 70], (40, 50], (40, 50], (30, 40]]
Length: 12
Categories (8, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 60] < (60, 70] < (70, 80] < (80, 90]]

In [12]:
# Same decade-wide bins, but right=False makes each interval include its left
# edge instead, e.g. an age of 20 now lands in [20, 30).
pd.cut(ages, bins=list(range(10, 91, 10)), right=False)

[[20, 30), [20, 30), [20, 30), [20, 30), [20, 30), ..., [30, 40), [60, 70), [40, 50), [40, 50), [30, 40)]
Length: 12
Categories (8, interval[int64]): [[10, 20) < [20, 30) < [30, 40) < [40, 50) < [50, 60) < [60, 70) < [70, 80) < [80, 90)]

In [13]:
# Human-readable labels, one per bin, in the same order as the bins.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]

In [16]:
# Re-declare the age bin edges and label each bucket with a group name
# instead of showing the interval itself.
bins = [18, 25, 35, 60, 80]
pd.cut(ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [17]:
# Number of ages per named group. The top-level pd.value_counts helper is
# deprecated (pandas 2.1+); Series.value_counts is the supported equivalent.
pd.Series(pd.cut(ages, bins, labels=group_names)).value_counts()

Youth         5
MiddleAged    3
YoungAdult    3
Senior        1
dtype: int64

In [18]:
# Draw 20 uniform samples in [0, 1) from a seeded generator so this cell gives
# the same result after a kernel restart, then let pd.cut split the observed
# range into 4 equal-width bins (precision=2 controls the bin-label precision).
data = np.random.default_rng(20).random(20)
pd.cut(data, 4, precision=2)


[(0.27, 0.49], (0.49, 0.72], (0.27, 0.49], (0.49, 0.72], (0.49, 0.72], ..., (0.27, 0.49], (0.49, 0.72], (0.041, 0.27], (0.72, 0.94], (0.49, 0.72]]
Length: 20
Categories (4, interval[float64]): [(0.041, 0.27] < (0.27, 0.49] < (0.49, 0.72] < (0.72, 0.94]]

In [19]:
# Observations per equal-width bin. The top-level pd.value_counts helper is
# deprecated (pandas 2.1+); Series.value_counts is the supported equivalent.
pd.Series(pd.cut(data, 4, precision=2)).value_counts()

(0.49, 0.72]     6
(0.041, 0.27]    6
(0.72, 0.94]     4
(0.27, 0.49]     4
dtype: int64

In [20]:
# Cut into quartiles: pd.qcut derives the bin edges from sample quantiles, so
# each of the 4 bins receives the same number of observations here.
# The sample comes from a seeded generator so the cell is reproducible.
data = np.random.default_rng(1000).standard_normal(1000)
pd.qcut(data, 4)


[(-0.0531, 0.617], (-0.0531, 0.617], (-3.3, -0.749], (0.617, 3.692], (-0.0531, 0.617], ..., (-3.3, -0.749], (0.617, 3.692], (0.617, 3.692], (0.617, 3.692], (-3.3, -0.749]]
Length: 1000
Categories (4, interval[float64]): [(-3.3, -0.749] < (-0.749, -0.0531] < (-0.0531, 0.617] < (0.617, 3.692]]

In [21]:
# Observations per quartile bin. The top-level pd.value_counts helper is
# deprecated (pandas 2.1+); Series.value_counts is the supported equivalent.
pd.Series(pd.qcut(data, 4)).value_counts()

(0.617, 3.692]       250
(-0.0531, 0.617]     250
(-0.749, -0.0531]    250
(-3.3, -0.749]       250
dtype: int64

In [22]:
# Custom quantile edges (values between 0 and 1): bottom 10%, 10-50%,
# 50-90% and top 10% of the sample.
pd.qcut(data, q=[0, 0.1, 0.5, 0.9, 1.0])

[(-0.0531, 1.169], (-0.0531, 1.169], (-3.3, -1.409], (-0.0531, 1.169], (-0.0531, 1.169], ..., (-3.3, -1.409], (-0.0531, 1.169], (-0.0531, 1.169], (-0.0531, 1.169], (-1.409, -0.0531]]
Length: 1000
Categories (4, interval[float64]): [(-3.3, -1.409] < (-1.409, -0.0531] < (-0.0531, 1.169] < (1.169, 3.692]]

In [23]:
# Observations per custom-quantile bin. The top-level pd.value_counts helper is
# deprecated (pandas 2.1+); Series.value_counts is the supported equivalent.
pd.Series(pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])).value_counts()

(-0.0531, 1.169]     400
(-1.409, -0.0531]    400
(1.169, 3.692]       100
(-3.3, -1.409]       100
dtype: int64

In [24]:
# Five unequal quantile bins: an extra edge at the 70th percentile splits the
# 50-90% band in two.
pd.qcut(data, q=[0, 0.1, 0.5, 0.7, 0.9, 1.0])

[(-0.0531, 0.465], (0.465, 1.169], (-3.3, -1.409], (0.465, 1.169], (0.465, 1.169], ..., (-3.3, -1.409], (0.465, 1.169], (0.465, 1.169], (0.465, 1.169], (-1.409, -0.0531]]
Length: 1000
Categories (5, interval[float64]): [(-3.3, -1.409] < (-1.409, -0.0531] < (-0.0531, 0.465] < (0.465, 1.169] < (1.169, 3.692]]

In [27]:
# Observations per bin for six unequal quantile bins. The top-level
# pd.value_counts helper is deprecated (pandas 2.1+); Series.value_counts is
# the supported equivalent.
pd.Series(pd.qcut(data, [0, 0.1, 0.3, 0.4, 0.5, 0.9, 1.])).value_counts()

(-0.0531, 1.169]     400
(-1.409, -0.586]     200
(1.169, 3.692]       100
(-0.299, -0.0531]    100
(-0.586, -0.299]     100
(-3.3, -1.409]       100
dtype: int64

In [28]:
# Detect outliers (absolute value greater than 3) and cap them at +/-3

In [32]:
# 1000 x 7 frame of standard-normal samples for the outlier examples, drawn
# from a seeded generator so the rows flagged below are the same on every run.
data = pd.DataFrame(np.random.default_rng(7).standard_normal((1000, 7)))
data.describe()

Unnamed: 0,0,1,2,3,4,5,6
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.031535,-0.019797,-0.009452,0.023961,0.075583,0.009913,0.024643
std,0.988387,0.997779,1.012735,1.01831,0.990005,0.967625,1.00943
min,-3.0321,-3.63401,-3.780945,-3.344809,-3.50814,-2.536288,-2.890089
25%,-0.683211,-0.697959,-0.676132,-0.681857,-0.586835,-0.610543,-0.655051
50%,-0.019131,-0.018273,-0.003047,0.023959,0.081323,-0.006828,0.00085
75%,0.623937,0.692204,0.650384,0.688938,0.730284,0.639051,0.690289
max,4.678413,2.917241,3.247006,4.001007,3.819563,3.269873,2.905443


In [33]:
# Peek at the first five rows of the random frame.
data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.573889,0.983214,0.381537,0.991621,0.942102,-1.421735,-0.555658
1,0.720607,1.207138,0.568765,1.172546,-0.294025,-0.375523,-0.583912
2,-0.36311,0.496066,0.578565,-0.902002,0.336164,0.835852,-1.982204
3,-1.122552,-0.063458,-2.294386,-0.701909,2.209166,0.312567,0.865364
4,-1.28058,-1.019189,0.373381,0.74999,0.113949,-1.155285,-0.139047


In [34]:
# Pull out the column labelled 2 as a Series to look for outliers in one column.
col = data[2]
col

0      0.381537
1      0.568765
2      0.578565
3     -2.294386
4      0.373381
         ...   
995    1.371744
996   -0.895554
997   -0.072628
998    0.769871
999    1.223932
Name: 2, Length: 1000, dtype: float64

In [35]:
# Entries of the column whose magnitude exceeds 3.
col[col.abs() > 3]

401   -3.780945
752    3.010100
905   -3.490724
955    3.247006
Name: 2, dtype: float64

In [38]:
# Rows containing at least one value whose magnitude exceeds 3.
# `axis` must be passed by keyword: the positional form `.any(1)` was
# deprecated and raises TypeError on pandas >= 2.0.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6
58,-1.300242,0.813957,-0.280482,-2.13318,3.819563,0.517857,-1.823093
59,0.476237,-0.032153,1.360485,-0.367167,-0.981897,3.269873,0.026641
61,-3.026466,0.759786,-0.202169,0.541964,0.831817,0.590962,-0.929066
266,4.678413,-0.073609,-0.143125,0.683036,1.048186,0.638133,0.610921
317,-1.115122,-1.015318,-1.02278,-3.344809,0.477796,-1.04305,-0.815656
401,-0.135885,-0.768186,-3.780945,-0.379935,0.769906,0.142843,-0.428254
548,-1.25014,-1.506301,-0.392548,3.389074,0.475476,-1.635492,-0.14904
593,-2.321227,0.029267,1.601639,4.001007,-1.524433,1.004025,-0.571713
717,-0.372405,0.546739,-1.422648,1.582601,3.210873,0.264721,0.792927
752,-0.051312,0.707953,3.0101,-0.063165,-0.068358,-0.003327,0.185698


In [43]:
# Rows containing at least one value whose magnitude exceeds 3 (shown again
# right before the values are capped).
# `axis` must be passed by keyword: the positional form `.any(1)` was
# deprecated and raises TypeError on pandas >= 2.0.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6
58,-1.300242,0.813957,-0.280482,-2.13318,3.819563,0.517857,-1.823093
59,0.476237,-0.032153,1.360485,-0.367167,-0.981897,3.269873,0.026641
61,-3.026466,0.759786,-0.202169,0.541964,0.831817,0.590962,-0.929066
266,4.678413,-0.073609,-0.143125,0.683036,1.048186,0.638133,0.610921
317,-1.115122,-1.015318,-1.02278,-3.344809,0.477796,-1.04305,-0.815656
401,-0.135885,-0.768186,-3.780945,-0.379935,0.769906,0.142843,-0.428254
548,-1.25014,-1.506301,-0.392548,3.389074,0.475476,-1.635492,-0.14904
593,-2.321227,0.029267,1.601639,4.001007,-1.524433,1.004025,-0.571713
717,-0.372405,0.546739,-1.422648,1.582601,3.210873,0.264721,0.792927
752,-0.051312,0.707953,3.0101,-0.063165,-0.068358,-0.003327,0.185698


In [44]:
# Cap outliers in place: every value with magnitude above 3 is replaced by
# +3 or -3, keeping its original sign. Note that this mutates `data`.
data[data.abs() > 3] = 3 * np.sign(data)

In [45]:
data

Unnamed: 0,0,1,2,3,4,5,6
0,0.573889,0.983214,0.381537,0.991621,0.942102,-1.421735,-0.555658
1,0.720607,1.207138,0.568765,1.172546,-0.294025,-0.375523,-0.583912
2,-0.363110,0.496066,0.578565,-0.902002,0.336164,0.835852,-1.982204
3,-1.122552,-0.063458,-2.294386,-0.701909,2.209166,0.312567,0.865364
4,-1.280580,-1.019189,0.373381,0.749990,0.113949,-1.155285,-0.139047
...,...,...,...,...,...,...,...
995,-1.289052,0.890543,1.371744,0.116542,0.559412,-0.398512,0.061693
996,-2.162296,-0.757617,-0.895554,0.117218,-0.279755,-0.891445,-0.241723
997,0.875297,-0.285745,-0.072628,1.016512,-3.000000,-0.329993,0.472298
998,-0.431485,-1.582240,0.769871,-0.754136,1.479410,-2.114966,-0.803198


In [47]:
# Rows that now contain a capped value (magnitude exactly 3).
# `axis` must be passed by keyword: the positional form `.any(1)` was
# deprecated and raises TypeError on pandas >= 2.0.
data[(np.abs(data) == 3).any(axis=1)]

Unnamed: 0,0,1,2,3,4,5,6
58,-1.300242,0.813957,-0.280482,-2.13318,3.0,0.517857,-1.823093
59,0.476237,-0.032153,1.360485,-0.367167,-0.981897,3.0,0.026641
61,-3.0,0.759786,-0.202169,0.541964,0.831817,0.590962,-0.929066
266,3.0,-0.073609,-0.143125,0.683036,1.048186,0.638133,0.610921
317,-1.115122,-1.015318,-1.02278,-3.0,0.477796,-1.04305,-0.815656
401,-0.135885,-0.768186,-3.0,-0.379935,0.769906,0.142843,-0.428254
548,-1.25014,-1.506301,-0.392548,3.0,0.475476,-1.635492,-0.14904
593,-2.321227,0.029267,1.601639,3.0,-1.524433,1.004025,-0.571713
717,-0.372405,0.546739,-1.422648,1.582601,3.0,0.264721,0.792927
752,-0.051312,0.707953,3.0,-0.063165,-0.068358,-0.003327,0.185698


In [48]:
# np.sign maps each value to -1.0 or 1.0 by its sign; this is the factor that
# was multiplied by 3 when capping the outliers.
np.sign(data).head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0
1,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0
2,-1.0,1.0,1.0,-1.0,1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0
4,-1.0,-1.0,1.0,1.0,1.0,-1.0,-1.0


In [50]:
# Display and reproducibility settings for the hierarchical-indexing examples.
pd.set_option("display.max_rows", 20)            # truncate long frames/series
np.set_printoptions(suppress=True, precision=4)  # compact float printing
np.random.seed(1223)                             # fixed seed for the random Series below

In [54]:
# Series with a two-level index built from two parallel label lists
# (outer level: letters, inner level: integers). Note that ('c', 1) and
# ('c', 3) each appear twice, so this index contains duplicate entries.
data = pd.Series(
    np.random.randn(9),
    index=[
        ["a", "a", "a", "b", "b", "c", "c", "c", "c"],
        [1, 2, 3, 1, 3, 1, 1, 3, 3],
    ],
)

In [55]:
data

a  1    0.809335
   2    0.955799
   3   -0.421880
b  1    0.346936
   3   -0.200127
c  1   -0.004921
   1    0.209049
   3   -0.171220
   3   -1.710489
dtype: float64

In [56]:
data.index


MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 1),
            ('c', 3),
            ('c', 3)],
           )

In [57]:
# Partial indexing: select everything under outer label 'b'; the result is
# indexed by the inner level only.
data['b']

1    0.346936
3   -0.200127
dtype: float64

In [58]:
# A list of outer labels keeps both index levels in the result.
data[['b','c']]

b  1    0.346936
   3   -0.200127
c  1   -0.004921
   1    0.209049
   3   -0.171220
   3   -1.710489
dtype: float64

In [60]:
# Label slice on the outer level; the end label 'b' is included.
data['a':'b']

a  1    0.809335
   2    0.955799
   3   -0.421880
b  1    0.346936
   3   -0.200127
dtype: float64

In [62]:
# Select by the inner level: all outer labels, inner label 2 (only 'a' has one).
data.loc[:,2]

a    0.955799
dtype: float64

In [64]:
# Inner label 3 across all outer labels; 'c' shows up twice because the index
# contains the entry ('c', 3) twice.
data.loc[:, 3]

a   -0.421880
b   -0.200127
c   -0.171220
c   -1.710489
dtype: float64

In [68]:
# Rebuild the example Series, this time with a duplicate-free two-level index
# (outer level 'a'..'d', inner level 1..3, with some combinations absent).
data = pd.Series(
    np.random.randn(9),
    index=[
        ["a", "a", "a", "b", "b", "c", "c", "d", "d"],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
data


a  1    0.344603
   2   -0.118409
   3    0.995263
b  1    0.893329
   3    1.972521
c  1    0.157336
   2   -0.623562
d  2    0.039139
   3   -1.730728
dtype: float64

In [69]:
# Pivot the inner index level into columns; (outer, inner) combinations that
# are absent from the Series show up as NaN.
data.unstack()

Unnamed: 0,1,2,3
a,0.344603,-0.118409,0.995263
b,0.893329,,1.972521
c,0.157336,-0.623562,
d,,0.039139,-1.730728


In [71]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [72]:
# Partial indexing on the outer level: everything under 'b', indexed by the
# inner level.
data['b']

1    0.893329
3    1.972521
dtype: float64

In [73]:
# Label slice on the outer level from 'b' through 'd' (both ends included).
data['b': 'd']

b  1    0.893329
   3    1.972521
c  1    0.157336
   2   -0.623562
d  2    0.039139
   3   -1.730728
dtype: float64

In [84]:
# A list passed to .loc is matched against the OUTER level; 3 is not an outer
# label, so the recorded result is empty. To select inner label 3 use
# data.loc[:, 3] instead.
# NOTE(review): newer pandas releases may raise KeyError here rather than
# returning an empty Series - confirm against the installed version.
data.loc[[3]]

Series([], dtype: float64)

In [80]:
# A list of outer labels with .loc keeps both index levels in the result.
data.loc[['b', 'd']]

b  1    0.893329
   3    1.972521
d  2    0.039139
   3   -1.730728
dtype: float64

In [85]:
# A single outer label with .loc drops the outer level from the result.
data.loc['a']

1    0.344603
2   -0.118409
3    0.995263
dtype: float64

In [86]:
# All outer labels, inner label 2 ('b' has no such entry, so it is absent).
data.loc[:, 2]

a   -0.118409
c   -0.623562
d    0.039139
dtype: float64

In [87]:
# Wide view again: inner level as columns, NaN where a combination is absent.
data.unstack()

Unnamed: 0,1,2,3
a,0.344603,-0.118409,0.995263
b,0.893329,,1.972521
c,0.157336,-0.623562,
d,,0.039139,-1.730728


In [88]:
# stack() is the inverse of unstack(): the round trip shown here returns the
# original Series, without the NaN cells of the wide view.
data.unstack().stack()

a  1    0.344603
   2   -0.118409
   3    0.995263
b  1    0.893329
   3    1.972521
c  1    0.157336
   2   -0.623562
d  2    0.039139
   3   -1.730728
dtype: float64

In [90]:
# 4 x 3 frame holding 0..11 with plain (single-level) row and column labels.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=["one", "two", "three", "four"],
    columns=["aaa", "bbb", "ccc"],
)

In [91]:
df

Unnamed: 0,aaa,bbb,ccc
one,0,1,2
two,3,4,5
three,6,7,8
four,9,10,11


In [100]:
# Same 4 x 3 data, now with two-level labels on both axes: each axis is given
# as a pair of parallel lists (outer labels, inner labels).
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=[["one", "two", "three", "four"], [1, 2, 1, 2]],
    columns=[["ab", "ab", "cc"], ["aaa", "bbb", "ccc"]],
)

In [101]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ab,ab,cc
Unnamed: 0_level_1,Unnamed: 1_level_1,aaa,bbb,ccc
one,1,0,1,2
two,2,3,4,5
three,1,6,7,8
four,2,9,10,11


In [102]:
df.index

MultiIndex([(  'one', 1),
            (  'two', 2),
            ('three', 1),
            ( 'four', 2)],
           )

In [103]:
df.index.names

FrozenList([None, None])

In [104]:
# Name the two row-index levels (they were unnamed) so they can be referred
# to by name later, e.g. in swaplevel.
df.index.names = ['key1', 'key2']

In [105]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,ab,ab,cc
Unnamed: 0_level_1,Unnamed: 1_level_1,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
two,2,3,4,5
three,1,6,7,8
four,2,9,10,11


In [106]:
df.columns.names

FrozenList([None, None])

In [109]:
# Name the two column levels as well.
df.columns.names = ['col_key1', 'col_key2']

In [110]:
df

Unnamed: 0_level_0,col_key1,ab,ab,cc
Unnamed: 0_level_1,col_key2,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
two,2,3,4,5
three,1,6,7,8
four,2,9,10,11


In [111]:
df.columns

MultiIndex([('ab', 'aaa'),
            ('ab', 'bbb'),
            ('cc', 'ccc')],
           names=['col_key1', 'col_key2'])

In [114]:
# Assigning None for each level removes the column level names again.
df.columns.names = [None,None]

In [115]:
df.columns

MultiIndex([('ab', 'aaa'),
            ('ab', 'bbb'),
            ('cc', 'ccc')],
           )

In [116]:
df.columns.names

FrozenList([None, None])

In [117]:
# Give the column levels their final names.
df.columns.names = ['col1_key', 'col2_key']

In [118]:
df.columns.names

FrozenList(['col1_key', 'col2_key'])

In [119]:
# Partial column indexing: all sub-columns under top-level label 'ab'; the
# result keeps only the inner column level.
df['ab']

Unnamed: 0_level_0,col2_key,aaa,bbb
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,0,1
two,2,3,4
three,1,6,7
four,2,9,10


In [120]:
# Same for top-level label 'cc', which has a single sub-column.
df['cc']

Unnamed: 0_level_0,col2_key,ccc
key1,key2,Unnamed: 2_level_1
one,1,2
two,2,5
three,1,8
four,2,11


In [122]:
# Rows whose outer index label (key1) is 'one'; the result is indexed by key2.
df.loc['one']

col1_key,ab,ab,cc
col2_key,aaa,bbb,ccc
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2


In [123]:
# A list of outer labels keeps both row-index levels in the result.
df.loc[['one', 'three']]

Unnamed: 0_level_0,col1_key,ab,ab,cc
Unnamed: 0_level_1,col2_key,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
three,1,6,7,8


In [125]:
# Swap the two row-index levels by name. This returns a new frame and leaves
# df unchanged; the row order is not altered.
df.swaplevel('key1', 'key2')

Unnamed: 0_level_0,col1_key,ab,ab,cc
Unnamed: 0_level_1,col2_key,aaa,bbb,ccc
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,one,0,1,2
2,two,3,4,5
1,three,6,7,8
2,four,9,10,11


In [126]:
df

Unnamed: 0_level_0,col1_key,ab,ab,cc
Unnamed: 0_level_1,col2_key,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
two,2,3,4,5
three,1,6,7,8
four,2,9,10,11


In [127]:
# Sort rows by the second index level (key2).
df.sort_index(level=1)

Unnamed: 0_level_0,col1_key,ab,ab,cc
Unnamed: 0_level_1,col2_key,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
three,1,6,7,8
four,2,9,10,11
two,2,3,4,5


In [128]:
# Sort rows by the first index level (key1), i.e. alphabetically here.
df.sort_index(level=0)

Unnamed: 0_level_0,col1_key,ab,ab,cc
Unnamed: 0_level_1,col2_key,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
four,2,9,10,11
one,1,0,1,2
three,1,6,7,8
two,2,3,4,5


In [129]:
# Sort columns (axis=1) by their outer level; they are already in sorted
# order ('ab', 'ab', 'cc'), so the frame looks unchanged.
df.sort_index(level=0, axis=1)

Unnamed: 0_level_0,col1_key,ab,ab,cc
Unnamed: 0_level_1,col2_key,aaa,bbb,ccc
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
two,2,3,4,5
three,1,6,7,8
four,2,9,10,11


In [130]:
# Replace the column labels with a new two-level index built from parallel
# outer/inner label lists. The new index carries no level names, so the
# names set earlier are dropped. This mutates df.
df.columns = pd.MultiIndex.from_arrays(
    [
        ["deep", "deep", "ai"],
        ["front", "rest", "ampersand"],
    ]
)

In [132]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,deep,deep,ai
Unnamed: 0_level_1,Unnamed: 1_level_1,front,rest,ampersand
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,0,1,2
two,2,3,4,5
three,1,6,7,8
four,2,9,10,11


In [133]:
# Sort columns by their outer level: 'ai' now comes before 'deep'.
df.sort_index(level=0, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ai,deep,deep
Unnamed: 0_level_1,Unnamed: 1_level_1,ampersand,front,rest
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,2,0,1
two,2,5,3,4
three,1,8,6,7
four,2,11,9,10


In [134]:
# Sort columns by their inner level ('ampersand', 'front', 'rest'); for these
# labels the resulting order matches the outer-level sort.
df.sort_index(level=1, axis=1)

Unnamed: 0_level_0,Unnamed: 1_level_0,ai,deep,deep
Unnamed: 0_level_1,Unnamed: 1_level_1,ampersand,front,rest
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,1,2,0,1
two,2,5,3,4
three,1,8,6,7
four,2,11,9,10


In [135]:
# Swap the row-index levels by name; the row order is left as it was.
df.swaplevel('key1', 'key2')

Unnamed: 0_level_0,Unnamed: 1_level_0,deep,deep,ai
Unnamed: 0_level_1,Unnamed: 1_level_1,front,rest,ampersand
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,one,0,1,2
2,two,3,4,5
1,three,6,7,8
2,four,9,10,11


In [136]:
# Same swap, addressing the levels by position instead of by name.
df.swaplevel(0,1)

Unnamed: 0_level_0,Unnamed: 1_level_0,deep,deep,ai
Unnamed: 0_level_1,Unnamed: 1_level_1,front,rest,ampersand
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,one,0,1,2
2,two,3,4,5
1,three,6,7,8
2,four,9,10,11


In [137]:
# Swap the row-index levels, then sort by the new outer level (key2) so rows
# sharing the same key2 value sit next to each other.
df.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,deep,deep,ai
Unnamed: 0_level_1,Unnamed: 1_level_1,front,rest,ampersand
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,one,0,1,2
1,three,6,7,8
2,four,9,10,11
2,two,3,4,5
