In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
import pandas.util.testing as tm; tm.N = 3

# Create a function that "unpivots" a wide frame into long (stacked) format
def unpivot(frame):
    """Reshape a wide DataFrame into long format.

    Parameters
    ----------
    frame : DataFrame
        Wide table: one column per variable, one index entry per observation.

    Returns
    -------
    DataFrame
        Columns ``date`` (the index labels of ``frame``), ``variable`` (the
        column labels of ``frame``) and ``value``; one row per cell of
        ``frame``, ordered column by column.
    """
    N, K = frame.shape

    data = {'value': frame.values.ravel('F'),  # 'F' flattens column by column
            'variable': np.asarray(frame.columns).repeat(N),
            'date': np.tile(np.asarray(frame.index), K)}

    # Return the DataFrame with a fixed column order
    return DataFrame(data, columns=['date', 'variable', 'value'])


# Set the DataFrame we'll be using: 3 business days x 4 random columns (A-D).
# The frame is built explicitly because pandas.util.testing and its
# makeTimeDataFrame helper are deprecated and removed in recent pandas releases.
dframe = unpivot(DataFrame(np.random.randn(3, 4),
                           index=pd.bdate_range('2000-01-03', periods=3),
                           columns=list('ABCD')))

In [3]:
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,2.080549
1,2000-01-04,A,-0.223059
2,2000-01-05,A,-0.285362
3,2000-01-03,B,-0.221894
4,2000-01-04,B,-0.369872
5,2000-01-05,B,1.024639
6,2000-01-03,C,1.678888
7,2000-01-04,C,0.651709
8,2000-01-05,C,-0.236948
9,2000-01-03,D,1.249803


In [4]:
# Reshape long -> wide: one row per 'date', one column per 'variable', cells filled from 'value'
dframe_piv = dframe.pivot(index = 'date', columns='variable', values='value')

In [5]:
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,2.080549,-0.221894,1.678888,1.249803
2000-01-04,-0.223059,-0.369872,0.651709,-0.15188
2000-01-05,-0.285362,1.024639,-0.236948,-0.726972


In [6]:
# Duplicates: detecting and dropping repeated rows

In [7]:
# Small frame with repeated (key1, key2) rows for the duplicate-handling examples
dframe = DataFrame({
    'key1': list('AABBB'),
    'key2': [2] * 3 + [3] * 2,
})


In [8]:
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [9]:
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [10]:
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [12]:
# Treat rows as duplicates when 'key1' alone matches; the first row of each key1 value is kept
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [14]:
# Same key1-only deduplication, but keep the last row of each key1 value instead of the first
dframe.drop_duplicates(['key1'],keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


In [15]:
# Mapping: deriving a new column from a lookup dict with Series.map

In [16]:
dframe = DataFrame({'city':['Alma','Brian Head','Fox Park'],
                    'altitude':[3158,3000,2762]})

In [17]:
dframe

Unnamed: 0,city,altitude
0,Alma,3158
1,Brian Head,3000
2,Fox Park,2762


In [21]:
state_map = {'Alma':'Colorado','Brian Head': 'Utah'}

In [22]:
# Look up each city in state_map; cities that are not keys of the dict get a missing value
dframe['state'] = dframe['city'].map(state_map)

In [23]:
dframe

Unnamed: 0,city,altitude,state
0,Alma,3158,Colorado
1,Brian Head,3000,Utah
2,Fox Park,2762,


In [24]:
# Replace: substituting values in a Series

In [25]:
# The values 1..4 repeated twice, used by the replace examples
ser1 = Series([1, 2, 3, 4] * 2)
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [26]:
ser1.replace(1,np.nan)

0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64

In [33]:
# Replace several values at once, pairing the lists by position: 1 -> 'a', 3 -> 'b'
ser1.replace([1,3],['a','b'])

0    a
1    2
2    b
3    4
4    a
5    2
6    b
7    4
dtype: object

In [34]:
ser1.replace({2:np.nan})

0    1.0
1    NaN
2    3.0
3    4.0
4    1.0
5    NaN
6    3.0
7    4.0
dtype: float64

In [35]:
# Rename index: relabeling index and column labels

In [38]:
dframe = DataFrame(np.arange(12).reshape(3,4),index =['NY','LA','SF'],columns = ['A','B','C','D'])

In [39]:
dframe

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [49]:
dframe.index = dframe.index.map(str.lower)

In [50]:
dframe

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [53]:
# Title-case the index labels and lower-case the column labels.
# Reassign the result rather than using inplace=True, which pandas discourages.
dframe = dframe.rename(index=str.title, columns=str.lower)

In [54]:
dframe

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [58]:
# Relabel single entries via dicts: index 'Ny' -> 'New York', column 'a' -> 'Alpha'.
# Labels that are not keys of the dicts are left unchanged.
# Reassign the result rather than using inplace=True, which pandas discourages.
dframe = dframe.rename(index={'Ny': 'New York'}, columns={'a': 'Alpha'})
dframe

Unnamed: 0,Alpha,b,c,d
New York,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [59]:
# Binning: grouping values into intervals with pd.cut

In [60]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]

In [61]:
# Bin edges every 10 years from 1960 through 2020 inclusive
decade_bins = list(range(1960, 2021, 10))

In [63]:
# Assign each year to the interval between consecutive bin edges.
# Intervals are closed on the right, so 1990 falls in (1980, 1990].
decade_cat = pd.cut(years,decade_bins)
decade_cat

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [64]:
decade_cat.categories

IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010], (2010, 2020]],
              closed='right',
              dtype='interval[int64]')

In [65]:
# Count how many years fall in each decade interval, listed in category order.
# Categorical.value_counts takes only a `dropna` flag, so the bin edges are not passed.
decade_cat.value_counts()

(1960, 1970]    1
(1970, 1980]    0
(1980, 1990]    2
(1990, 2000]    3
(2000, 2010]    2
(2010, 2020]    3
dtype: int64

In [68]:
pd.cut(years,bins=2,precision=1)

[(1969.0, 1992.0], (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], ..., (1969.0, 1992.0], (1969.0, 1992.0], (1992.0, 2015.0], (1992.0, 2015.0], (1992.0, 2015.0]]
Length: 11
Categories (2, interval[float64]): [(1969.0, 1992.0] < (1992.0, 2015.0]]

In [69]:
# Outliers: finding and capping extreme values

In [2]:
np.random.seed(123)

In [15]:
dframe = DataFrame(np.random.randn(1000,4))
dframe

Unnamed: 0,0,1,2,3
0,-1.275877,-0.475048,1.732000,0.274019
1,-0.623326,1.543678,0.345436,-2.452778
2,0.393886,-0.348284,1.113102,1.140309
3,1.267204,1.093961,0.716824,0.727974
4,-0.824845,0.408178,-0.588906,2.486229
...,...,...,...,...
995,-2.506713,2.595257,1.228712,0.215831
996,0.928928,-0.878502,0.467351,-0.337109
997,-0.105172,1.513999,-0.440898,1.374422
998,1.652277,-1.635366,-1.133904,1.363655


In [17]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.028082,-0.033854,0.021224,0.033545
std,1.007383,1.003525,1.016456,1.004112
min,-3.080841,-3.388116,-2.78829,-3.13454
25%,-0.623886,-0.743271,-0.65726,-0.658904
50%,0.02295,-0.054645,0.021022,0.046431
75%,0.637991,0.676547,0.699898,0.690236
max,3.525945,2.959493,3.363617,2.829343


In [21]:
col = dframe[0]
col

0     -1.275877
1     -0.623326
2      0.393886
3      1.267204
4     -0.824845
         ...   
995   -2.506713
996    0.928928
997   -0.105172
998    1.652277
999   -2.221896
Name: 0, Length: 1000, dtype: float64

In [26]:
col[np.abs(col)>3]

145    3.268255
225   -3.080841
238    3.033925
301    3.123607
315    3.525945
456    3.153396
866    3.010778
Name: 0, dtype: float64

In [32]:
# Select every row with at least one value whose magnitude exceeds 3.
# axis is passed by keyword: pandas >= 2.0 rejects a positional axis for DataFrame.any.
dframe[(np.abs(dframe) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
54,0.183977,-3.388116,-2.759659,0.019657
145,3.268255,-1.140417,-0.901446,-0.170055
225,-3.080841,1.455828,-0.635067,0.393876
227,0.403473,1.124247,3.180927,0.368362
238,3.033925,0.611257,0.991568,1.498762
301,3.123607,-1.398992,-0.836612,0.688468
315,3.525945,1.020559,0.651604,-1.247385
456,3.153396,-0.857727,-2.212731,-1.316734
595,1.147896,0.706748,1.63342,-3.033117
808,-0.988894,0.247688,3.363617,0.880421


In [33]:
# Cap outliers in place: wherever |x| > 3, overwrite the value with 3 carrying its original sign
dframe[np.abs(dframe)>3] = np.sign(dframe)*3

In [34]:
# Select every row with at least one value whose magnitude is exactly 3.
# axis is passed by keyword: pandas >= 2.0 rejects a positional axis for DataFrame.any.
dframe[(np.abs(dframe) == 3).any(axis=1)]

Unnamed: 0,0,1,2,3
54,0.183977,-3.0,-2.759659,0.019657
145,3.0,-1.140417,-0.901446,-0.170055
225,-3.0,1.455828,-0.635067,0.393876
227,0.403473,1.124247,3.0,0.368362
238,3.0,0.611257,0.991568,1.498762
301,3.0,-1.398992,-0.836612,0.688468
315,3.0,1.020559,0.651604,-1.247385
456,3.0,-0.857727,-2.212731,-1.316734
595,1.147896,0.706748,1.63342,-3.0
808,-0.988894,0.247688,3.0,0.880421


In [35]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.027047,-0.033466,0.02068,0.033712
std,1.003641,1.0023,1.014776,1.003595
min,-3.0,-3.0,-2.78829,-3.0
25%,-0.623886,-0.743271,-0.65726,-0.658904
50%,0.02295,-0.054645,0.021022,0.046431
75%,0.637991,0.676547,0.699898,0.690236
max,3.0,2.959493,3.0,2.829343


In [37]:
dframe = DataFrame(np.arange(16).reshape(4,4))
dframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [38]:
blender = np.random.permutation(4)
blender

array([2, 0, 1, 3])

In [39]:
# Reorder the rows by position, following the order given in blender
dframe.take(blender)

Unnamed: 0,0,1,2,3
2,8,9,10,11
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15


In [40]:
box = np.array([1,2,3])

In [41]:
# Draw 10 random positions into box: integers in [0, len(box)), repeats allowed
shaker = np.random.randint(0,len(box),size = 10)
shaker

array([0, 2, 2, 2, 0, 2, 2, 0, 1, 1])

In [42]:
# Pick the box elements at the drawn positions, so the same element can appear more than once
hand_grabs = box.take(shaker)
hand_grabs

array([1, 3, 3, 3, 1, 3, 3, 1, 2, 2])