In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

In [2]:
# Sample years to be binned, and the decade boundaries used as bin edges.
years = [
    1990, 1991, 1992, 2008, 2012, 2015,
    1987, 1969, 2013, 2008, 1999,
]
decade_bins = list(range(1960, 2021, 10))  # 1960, 1970, ..., 2020

In [3]:
# Bin each year into its decade; pd.cut returns a Categorical of intervals.
decade_cat = pd.cut(x=years, bins=decade_bins)
decade_cat

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [4]:
# Inspect the interval categories that the binning produced.
decade_cat.categories

IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010], (2010, 2020]]
              closed='right',
              dtype='interval[int64]')

In [5]:
# Count how many years fall into each decade bin (empty bins are reported as 0).
# The top-level pd.value_counts helper is deprecated; wrap the Categorical in a
# Series and use the Series.value_counts method instead.
pd.Series(decade_cat).value_counts()

(2010, 2020]    3
(1990, 2000]    3
(2000, 2010]    2
(1980, 1990]    2
(1960, 1970]    1
(1970, 1980]    0
dtype: int64

In [7]:
# Passing an integer for `bins` asks pd.cut for that many equal-width bins
# over the data range; `precision` controls how the bin labels are rounded.
pd.cut(x=years, bins=2, precision=1)

[(1969.0, 1992.0], (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], ..., (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], (1992.0, 2015.0]]
Length: 11
Categories (2, interval[float64]): [(1969.0, 1992.0] < (1992.0, 2015.0]]

# Outliers

In [8]:
# Seed NumPy's global random generator so the random draws below are repeatable.
np.random.seed(12345)

In [11]:
# Build a 1000 x 4 frame of standard-normal samples and preview the first rows.
df = DataFrame(data=np.random.randn(1000, 4))
df.head()

Unnamed: 0,0,1,2,3
0,-1.16653,-0.075264,0.112345,0.166874
1,0.012628,0.815313,-0.732001,0.868791
2,0.149693,0.485218,0.161056,-1.068808
3,1.190359,-1.053204,0.776001,1.31126
4,1.159677,0.477395,-0.004493,0.574631


In [12]:
# Preview the last rows of the frame.
df.tail()

Unnamed: 0,0,1,2,3
995,-1.530608,0.058991,-0.33712,1.129394
996,-0.151214,-0.898121,-1.467595,-1.086964
997,-0.104017,1.442214,-1.713617,0.936543
998,1.932427,0.530258,-0.217213,0.988951
999,0.193746,-1.744483,0.761804,-1.544201


In [13]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.039859,-0.03335,-0.011749,-0.031858
std,1.004647,0.998154,0.987471,1.01359
min,-3.530912,-3.02411,-3.170292,-3.105636
25%,-0.738643,-0.698731,-0.692887,-0.75801
50%,0.011608,-0.016171,0.02257,-0.035527
75%,0.6638,0.605952,0.666451,0.693491
max,2.916153,3.061029,2.623689,3.144389


In [14]:
# Pull out the column labelled 0 as a Series and preview it.
col = df.loc[:, 0]
col.head()

0   -1.166530
1    0.012628
2    0.149693
3    1.190359
4    1.159677
Name: 0, dtype: float64

In [15]:
# Check which entries of the column have an absolute value greater than 3.
col.loc[np.abs(col) > 3]

421   -3.530912
Name: 0, dtype: float64

In [16]:
# Select every row that contains at least one value with |value| > 3.
# `axis` is passed by keyword: newer pandas releases no longer accept it
# as a positional argument to DataFrame.any.
df[(np.abs(df) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
158,0.623798,-0.436479,0.901529,-3.044612
192,0.617561,-1.148738,-3.170292,-1.017073
348,0.813014,-1.202724,-0.286215,-3.105636
360,0.123291,-3.02411,-1.168413,-0.888664
421,-3.530912,-0.576175,-0.750648,0.025443
712,-0.928871,3.061029,-0.297909,0.990886
787,0.836054,-0.78062,0.622791,3.144389


In [19]:
# Cap outliers by clipping every value into [-3, 3]. This yields the same values
# as overwriting entries with |value| > 3 by sign(value) * 3, but uses the
# built-in clip and avoids a masked in-place assignment on the frame.
df = df.clip(lower=-3, upper=3)
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.039328,-0.033386,-0.011579,-0.031852
std,1.002939,0.997895,0.98694,1.0127
min,-3.0,-3.0,-3.0,-3.0
25%,-0.738643,-0.698731,-0.692887,-0.75801
50%,0.011608,-0.016171,0.02257,-0.035527
75%,0.6638,0.605952,0.666451,0.693491
max,2.916153,3.0,2.623689,3.0


# Permutation

In [23]:
# A small 4 x 4 frame holding 0..15 in row-major order, used to demo permutation.
df = DataFrame(data=np.arange(16).reshape((4, 4)))
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [26]:
# Random ordering of the frame's row positions. The length is taken from the
# frame itself rather than hard-coded, so it stays valid if the frame changes.
blender = np.random.permutation(len(df))
blender

array([1, 0, 3, 2])

In [27]:
# Reorder the rows by position according to the shuffled index array.
df.take(indices=blender, axis=0)

Unnamed: 0,0,1,2,3
1,4,5,6,7
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11


In [28]:
# Sampling with replacement: draw 10 random positions into `box`.
box = np.array([1, 2, 3])
shaker = np.random.randint(low=0, high=len(box), size=10)

In [29]:
# Display the sampled index positions.
shaker

array([0, 2, 0, 1, 2, 0, 0, 0, 0, 1])

In [30]:
# Map the sampled positions back to the values stored in `box`.
hand_grabs = box.take(indices=shaker)
hand_grabs

array([1, 3, 1, 2, 3, 1, 1, 1, 1, 2])