In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# pandas.util.testing (tm.makeTimeDataFrame / tm.N) was deprecated in pandas 1.0
# and is not public API, so the demo frame is built explicitly instead.
N_DATES = 3  # number of business days (rows) in the wide demo frame


# Create an unpivot function
def unpivot(frame):
    """Reshape a wide DataFrame into long (date, variable, value) form.

    Parameters
    ----------
    frame : DataFrame
        Wide table with N rows and K columns.

    Returns
    -------
    DataFrame
        N * K rows with columns ['date', 'variable', 'value']: 'date' holds
        the original index labels, 'variable' the original column labels and
        'value' the cell contents, laid out one source column after another.
    """
    N, K = frame.shape

    # ravel('F') walks the values column by column, so each column label is
    # repeated N times and the index is tiled K times to stay aligned.
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}

    # Return the DataFrame
    return DataFrame(data, columns=['date', 'variable', 'value'])


# Wide frame: N_DATES business days starting 2000-01-03, columns A-D, random values
wide_frame = DataFrame(np.random.randn(N_DATES, 4),
                       index=pd.date_range('2000-01-03', periods=N_DATES, freq='B'),
                       columns=list('ABCD'))

# Set the DataFrame we'll be using
dframe = unpivot(wide_frame)

In [3]:
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,-0.976114
1,2000-01-04,A,-0.590094
2,2000-01-05,A,-0.726883
3,2000-01-03,B,1.718127
4,2000-01-04,B,0.849777
5,2000-01-05,B,-0.586377
6,2000-01-03,C,0.188078
7,2000-01-04,C,-0.398802
8,2000-01-05,C,0.183237
9,2000-01-03,D,-0.394202


In [4]:
# Pivot the long frame back to wide form: one row per date, one column per
# variable, cells filled from the 'value' column
dframe_piv = dframe.pivot(index='date', columns='variable', values='value')

In [5]:
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-0.976114,1.718127,0.188078,-0.394202
2000-01-04,-0.590094,0.849777,-0.398802,-0.469794
2000-01-05,-0.726883,-0.586377,0.183237,0.232284


In [6]:
#Duplicates

In [6]:
# Five rows containing repeated (key1, key2) pairs, used to demonstrate
# duplicate detection and removal
dframe = DataFrame({
    'key1': ['A', 'A', 'B', 'B', 'B'],
    'key2': [2, 2, 2, 3, 3],
})


In [7]:
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [8]:
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [9]:
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [10]:
# Passing a column list restricts the duplicate check to those columns;
# the first row seen for each key1 value is kept
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [11]:
# keep='last' retains the final row for each key1 value instead of the first
dframe.drop_duplicates(['key1'],keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


In [15]:
#Mapping

In [12]:
# Small city/altitude table used for the mapping example
dframe = DataFrame({
    'city': ['Alma', 'Brian Head', 'Fox Park'],
    'altitude': [3158, 3000, 2762],
})

In [13]:
dframe

Unnamed: 0,city,altitude
0,Alma,3158
1,Brian Head,3000
2,Fox Park,2762


In [21]:
state_map = {'Alma':'Colorado','Brian Head': 'Utah'}

In [22]:
# Look up each city in state_map; cities absent from the mapping
# (here 'Fox Park') end up with a missing value in the new 'state' column
dframe['state'] = dframe['city'].map(state_map)

In [23]:
dframe

Unnamed: 0,city,altitude,state
0,Alma,3158,Colorado
1,Brian Head,3000,Utah
2,Fox Park,2762,


In [24]:
#replace

In [14]:
# Integer Series with the pattern 1..4 repeated twice, used for the replace examples
ser1 = Series([1, 2, 3, 4] * 2)
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [15]:
# Swap every 1 for NaN; note the result comes back as float64, not int64
ser1.replace(1,np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [33]:
ser1.replace([1,3],['a','b'])

0    a
1    2
2    b
3    4
4    a
5    2
6    b
7    4
dtype: object

In [34]:
ser1.replace({2:np.nan})

0    1.0
1    NaN
2    3.0
3    4.0
4    1.0
5    NaN
6    3.0
7    4.0
dtype: float64

In [35]:
#rename index

In [16]:
# 3x4 frame of consecutive integers with city labels on the rows and
# letter labels on the columns, used for the rename examples
dframe = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['NY', 'LA', 'SF'],
    columns=['A', 'B', 'C', 'D'],
)

In [17]:
dframe

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [18]:
# Lower-case every index label by mapping str.lower over the index
dframe.index = dframe.index.map(str.lower)

In [19]:
dframe

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [20]:
# Title-case the index labels and lower-case the column names. rename returns
# a new frame, so rebind dframe instead of mutating it with inplace=True.
dframe = dframe.rename(index=str.title, columns=str.lower)

In [21]:
dframe

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [58]:
# Dict-based rename only touches the listed labels; all other labels are kept.
# Rebind the result rather than mutating the frame with inplace=True.
dframe = dframe.rename(index={'Ny': 'New York'}, columns={'a': 'Alpha'})
dframe

Unnamed: 0,Alpha,b,c,d
New York,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [59]:
#Binning

In [22]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]

In [23]:
decade_bins = [1960,1970,1980,1990,2000,2010,2020]

In [24]:
# Assign each year to a decade bin. Bins are right-closed, so 1990 lands in
# (1980, 1990] rather than (1990, 2000]
decade_cat = pd.cut(years,decade_bins)
decade_cat

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [64]:
decade_cat.categories

IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010], (2010, 2020]],
              closed='right',
              dtype='interval[int64]')

In [27]:
# Count how many years fall in each decade bin. value_counts() has no bins
# parameter (its only argument is dropna); the categories already carry the
# bin edges, so nothing needs to be passed.
decade_cat.value_counts()

(1960, 1970]    1
(1970, 1980]    0
(1980, 1990]    2
(1990, 2000]    3
(2000, 2010]    2
(2010, 2020]    3
dtype: int64

In [68]:
pd.cut(years,bins=2,precision=1)

[(1969.0, 1992.0], (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], ..., (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], (1992.0, 2015.0]]
Length: 11
Categories (2, interval[float64]): [(1969.0, 1992.0] < (1992.0, 2015.0]]

In [69]:
#Outliers

In [42]:
np.random.seed(123)

In [43]:
dframe = DataFrame(np.random.randn(1000,4))
dframe.head()

Unnamed: 0,0,1,2,3
0,-1.085631,0.997345,0.282978,-1.506295
1,-0.5786,1.651437,-2.426679,-0.428913
2,1.265936,-0.86674,-0.678886,-0.094709
3,1.49139,-0.638902,-0.443982,-0.434351
4,2.20593,2.186786,1.004054,0.386186


In [44]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.007502,0.03916,-0.010286,0.024285
std,0.977024,0.973484,1.01223,0.970421
min,-3.167055,-2.920029,-3.801378,-3.231055
25%,-0.662012,-0.63616,-0.687717,-0.599195
50%,-0.024843,0.062549,0.007035,0.038718
75%,0.61395,0.672448,0.664586,0.683228
max,3.050755,2.850708,2.766603,3.571579


In [45]:
col = dframe[0]
col.head(10)

0   -1.085631
1   -0.578600
2    1.265936
3    1.491390
4    2.205930
5    0.737369
6   -1.253881
7   -0.140069
8   -1.771533
9    0.002846
Name: 0, dtype: float64

In [46]:
# Entries of the first column whose magnitude exceeds 3
col[col.abs() > 3]

235   -3.167055
910    3.050755
Name: 0, dtype: float64

In [47]:
# Rows where at least one column has |value| > 3. axis must be passed by
# keyword: positional arguments to DataFrame.any are rejected by pandas 2.x.
dframe[(np.abs(dframe) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
48,0.199582,-0.126118,0.197019,-3.231055
182,0.272735,0.425336,-0.230904,3.571579
235,-3.167055,-0.713989,-1.112364,-1.254184
409,0.151037,0.069403,-3.801378,-1.127172
423,0.231228,1.076113,-3.587494,1.148869
510,0.506533,0.644099,-3.066988,-1.349275
910,3.050755,0.296552,-0.481843,0.930787


In [48]:
# Cap outliers: wherever |value| > 3, overwrite the cell with +3 or -3,
# keeping the original sign. This modifies dframe in place.
dframe[np.abs(dframe)>3] = np.sign(dframe)*3

In [49]:
# Rows that contain a capped value (exactly +3 or -3). axis must be passed by
# keyword: positional arguments to DataFrame.any are rejected by pandas 2.x.
dframe[(np.abs(dframe) == 3).any(axis=1)]

Unnamed: 0,0,1,2,3
48,0.199582,-0.126118,0.197019,-3.0
182,0.272735,0.425336,-0.230904,3.0
235,-3.0,-0.713989,-1.112364,-1.254184
409,0.151037,0.069403,-3.0,-1.127172
423,0.231228,1.076113,-3.0,1.148869
510,0.506533,0.644099,-3.0,-1.349275
910,3.0,0.296552,-0.481843,0.930787


In [50]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.007386,0.03916,-0.008831,0.023944
std,0.97634,0.973484,1.007422,0.967746
min,-3.0,-2.920029,-3.0,-3.0
25%,-0.662012,-0.63616,-0.687717,-0.599195
50%,-0.024843,0.062549,0.007035,0.038718
75%,0.61395,0.672448,0.664586,0.683228
max,3.0,2.850708,2.766603,3.0


In [37]:
# Permutation: start from a small 4x4 frame whose rows are easy to track
dframe = DataFrame(np.arange(16).reshape(4,4))
dframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [38]:
# Random ordering of the row positions; sized from the frame so it stays
# valid if the number of rows changes
blender = np.random.permutation(len(dframe))
blender

array([2, 0, 1, 3])

In [39]:
dframe.take(blender)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15


In [40]:
box = np.array([1,2,3])

In [41]:
# Draw 10 random positions into box (with replacement); randint's upper
# bound is exclusive, so every draw is a valid index 0..len(box)-1
shaker = np.random.randint(0,len(box),size = 10)
shaker

array([0, 2, 2, 2, 0, 2, 2, 0, 1, 1])

In [42]:
hand_grabs = box.take(shaker)
hand_grabs

array([1, 3, 3, 3, 1, 3, 3, 1, 2, 2])