# file read and write

In [1]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [3]:
# Can open csv files as a dataframe
dframe = pd.read_csv('data/lec25.csv')

#Show
dframe

Unnamed: 0,q,r,s,t,apple
0,2,3,4,5,pear
1,a,s,d,f,rabbit
2,5,2,5,7,dog


In [5]:
# Can also use read_table with ',' as a delimiter
dframe = pd.read_table('data/lec25.csv',sep=',')

#Show
dframe

Unnamed: 0,q,r,s,t,apple
0,2,3,4,5,pear
1,a,s,d,f,rabbit
2,5,2,5,7,dog


In [6]:
#If we dont want the header to be the first row
dframe = pd.read_csv('data/lec25.csv',header=None)

#Show
dframe

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear
2,a,s,d,f,rabbit
3,5,2,5,7,dog


In [8]:
# We can also indicate a particular number of rows to be read
# 只读两行
pd.read_csv('data/lec25.csv',header=None,nrows=2)

Unnamed: 0,0,1,2,3,4
0,q,r,s,t,apple
1,2,3,4,5,pear


In [None]:
# Now let's see how we can write DataFrames out to text files
dframe.to_csv('mytextdata_out.csv')

#You'll see this file in the folder where your IPython notebooks are saved (usually under My Documents)

In [9]:
#  We can also use other delimiters

#we'll import sys to see the output
import sys 

#Use sys.stdout to see the output directly and not save it
dframe.to_csv(sys.stdout,sep='_')

_0_1_2_3_4
0_q_r_s_t_apple
1_2_3_4_5_pear
2_a_s_d_f_rabbit
3_5_2_5_7_dog


In [10]:
#We can also choose to write only a specific subset of columns
dframe.to_csv(sys.stdout,columns=[0,1,2])

,0,1,2
0,q,r,s
1,2,3,4
2,a,s,d
3,5,2,5


In [12]:
# 平时自己习惯用
dframe[[0,1,2]].to_csv(sys.stdout)

,0,1,2
0,q,r,s
1,2,3,4
2,a,s,d
3,5,2,5


# JSON

In [13]:
json_obj = """
{   "zoo_animal": "Lion",
    "food": ["Meat", "Veggies", "Honey"],
    "fur": "Golden",
    "clothes": null, 
    "diet": [{"zoo_animal": "Gazelle", "food":"grass", "fur": "Brown"}]
}
"""

In [14]:
#Let import json module
import json

#Lets load json data
data = json.loads(json_obj)

In [15]:
data

{u'clothes': None,
 u'diet': [{u'food': u'grass', u'fur': u'Brown', u'zoo_animal': u'Gazelle'}],
 u'food': [u'Meat', u'Veggies', u'Honey'],
 u'fur': u'Golden',
 u'zoo_animal': u'Lion'}

In [16]:
#WE can also convert back to JSON
json.dumps(data)

'{"food": ["Meat", "Veggies", "Honey"], "zoo_animal": "Lion", "fur": "Golden", "diet": [{"food": "grass", "zoo_animal": "Gazelle", "fur": "Brown"}], "clothes": null}'

In [17]:
#We can simply open JSON data after loading with a DataFrame
dframe = DataFrame(data['diet'])

In [18]:
dframe

Unnamed: 0,food,fur,zoo_animal
0,grass,Brown,Gazelle


# Merge

In [23]:
# Let's make a dframe

dframe1 = DataFrame({'key':['X','Z','Y','Z','X','X'],'data_set_1': np.arange(6)})

#Show
dframe1

Unnamed: 0,data_set_1,key
0,0,X
1,1,Z
2,2,Y
3,3,Z
4,4,X
5,5,X


In [24]:
#Now lets make another dframe

dframe2 = DataFrame({'key':['Q','Y','Z'],'data_set_2':[1,2,3]})

#Show
dframe2

Unnamed: 0,data_set_2,key
0,1,Q
1,2,Y
2,3,Z


In [25]:
# Now we can use merge the dataframes, this is a "many-to-one" situation

# Merge will automatically choose overlapping columns to merge on

# 默认上是inner merge，根据两个df中相同的column
pd.merge(dframe1,dframe2)

#Note no overlapping 'X's

Unnamed: 0,data_set_1,key,data_set_2
0,1,Z,3
1,3,Z,3
2,2,Y,2


In [26]:
# We could have also specified which column to merge on
# 一般我们需要明确的指定如何merge
pd.merge(dframe1,dframe2,on='key')

Unnamed: 0,data_set_1,key,data_set_2
0,1,Z,3
1,3,Z,3
2,2,Y,2


In [27]:
# We can choose which DataFrame's keys to use, this will choose left (dframe1)
# 这里指定使用左边的key作为参考，如果在被merge df没有出现的值，将使用NaN来填充
pd.merge(dframe1,dframe2,on='key',how='left')

Unnamed: 0,data_set_1,key,data_set_2
0,0,X,
1,1,Z,3.0
2,2,Y,2.0
3,3,Z,3.0
4,4,X,
5,5,X,


In [28]:
# Choosing the one on the right (dframe2)
pd.merge(dframe1,dframe2,on='key',how='right')

Unnamed: 0,data_set_1,key,data_set_2
0,1.0,Z,3
1,3.0,Z,3
2,2.0,Y,2
3,,Q,1


In [29]:
#Choosing the "outer" method selects the union of both keys
# 使用outer的话，我们将会得到key的并集
pd.merge(dframe1,dframe2,on='key',how='outer')

Unnamed: 0,data_set_1,key,data_set_2
0,0.0,X,
1,4.0,X,
2,5.0,X,
3,1.0,Z,3.0
4,3.0,Z,3.0
5,2.0,Y,2.0
6,,Q,1.0


In [30]:
#Now we'll learn about a many to many merge

# Note that these DataFrames contain more than one instance of the key in BOTH datasets

dframe3 = DataFrame({'key': ['X', 'X', 'X', 'Y', 'Z', 'Z'],
                 'data_set_3': range(6)})
dframe4 = DataFrame({'key': ['Y', 'Y', 'X', 'X', 'Z'],
                 'data_set_4': range(5)})

In [31]:
dframe3

Unnamed: 0,data_set_3,key
0,0,X
1,1,X
2,2,X
3,3,Y
4,4,Z
5,5,Z


In [32]:
dframe4

Unnamed: 0,data_set_4,key
0,0,Y
1,1,Y
2,2,X
3,3,X
4,4,Z


In [33]:
# 因为双方的key有比较多的重叠，合并后的条目有3*2 + 1*2 + 2*1 = 10
# 这就是many-to-many merge
pd.merge(dframe3, dframe4)

Unnamed: 0,data_set_3,key,data_set_4
0,0,X,2
1,0,X,3
2,1,X,2
3,1,X,3
4,2,X,2
5,2,X,3
6,3,Y,0
7,3,Y,1
8,4,Z,4
9,5,Z,4


In [34]:
# We can also merge with multiple keys!

# Dframe on left
df_left = DataFrame({'key1': ['SF', 'SF', 'LA'],
                  'key2': ['one', 'two', 'one'],
                  'left_data': [10,20,30]})

#Dframe on right
df_right = DataFrame({'key1': ['SF', 'SF', 'LA', 'LA'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'right_data': [40,50,60,70]})

In [35]:
df_left

Unnamed: 0,key1,key2,left_data
0,SF,one,10
1,SF,two,20
2,LA,one,30


In [36]:
df_right

Unnamed: 0,key1,key2,right_data
0,SF,one,40
1,SF,one,50
2,LA,one,60
3,LA,two,70


In [37]:
#Merge
pd.merge(df_left, df_right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,left_data,right_data
0,SF,one,10.0,40.0
1,SF,one,10.0,50.0
2,SF,two,20.0,
3,LA,one,30.0,60.0
4,LA,two,,70.0


In [38]:
#Note that the left and right DataFrames have overlapping key names (key1 and key2).
# pandas automatically adds suffixes to them
# 遇到双方相同column名字的时候，会重命名column
pd.merge(df_left,df_right,on='key1')

Unnamed: 0,key1,key2_x,left_data,key2_y,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


In [39]:
# 可以指定column名
pd.merge(df_left,df_right, on='key1',suffixes=('_lefty','_righty'))

Unnamed: 0,key1,key2_lefty,left_data,key2_righty,right_data
0,SF,one,10,one,40
1,SF,one,10,one,50
2,SF,two,20,one,40
3,SF,two,20,one,50
4,LA,one,30,one,60
5,LA,one,30,two,70


# Merge on Index

In [40]:
# Lets get two dframes

df_left = DataFrame({'key': ['X','Y','Z','X','Y'],
                  'data': range(5)})
df_right = DataFrame({'group_data': [10, 20]}, index=['X', 'Y'])

In [41]:
df_left

Unnamed: 0,data,key
0,0,X
1,1,Y
2,2,Z
3,3,X
4,4,Y


In [42]:
df_right

Unnamed: 0,group_data
X,10
Y,20


In [46]:
# In other words, each side specifies its own key, and an index can also be used as the key

In [43]:
#Now merge, we'll use the key for the left Dframe, and the index for the right
pd.merge(df_left,df_right,left_on='key',right_index=True)

Unnamed: 0,data,key,group_data
0,0,X,10
3,3,X,10
1,1,Y,20
4,4,Y,20


In [44]:
# We can also get a union by using outer
pd.merge(df_left,df_right,left_on='key',right_index=True,how='outer')

Unnamed: 0,data,key,group_data
0,0,X,10.0
3,3,X,10.0
1,1,Y,20.0
4,4,Y,20.0
2,2,Z,


In [45]:
#Now let's try something a little more complicated, remember hierarchical indexes?
df_left_hr = DataFrame({'key1': ['SF','SF','SF','LA','LA'],
                   'key2': [10, 20, 30, 20, 30],
                   'data_set': np.arange(5.)})
df_right_hr = DataFrame(np.arange(10).reshape((5, 2)),
                   index=[['LA','LA','SF','SF','SF'],
                          [20, 10, 10, 10, 20]],
                   columns=['col_1', 'col_2'])

In [47]:
#SHOW
df_left_hr

Unnamed: 0,data_set,key1,key2
0,0,SF,10
1,1,SF,20
2,2,SF,30
3,3,LA,20
4,4,LA,30


In [48]:
#Show, this has a index hierarchy
df_right_hr

Unnamed: 0,Unnamed: 1,col_1,col_2
LA,20,0,1
LA,10,2,3
SF,10,4,5
SF,10,6,7
SF,20,8,9


In [49]:
# Now we can merge the left by using keys and the right by its index
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True)

Unnamed: 0,data_set,key1,key2,col_1,col_2
0,0,SF,10,4,5
0,0,SF,10,6,7
1,1,SF,20,8,9
3,3,LA,20,0,1


In [50]:
# We can also keep a union by choosing the 'outer' method
pd.merge(df_left_hr,df_right_hr,left_on=['key1','key2'],right_index=True,how='outer')

Unnamed: 0,data_set,key1,key2,col_1,col_2
0,0.0,SF,10,4.0,5.0
0,0.0,SF,10,6.0,7.0
1,1.0,SF,20,8.0,9.0
2,2.0,SF,30,,
3,3.0,LA,20,0.0,1.0
4,4.0,LA,30,,
4,,LA,10,2.0,3.0


# Concatenate

In [54]:
# First in just Numpy
# Create a matrix 
arr1 = np.arange(9).reshape((3,3))
arr1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [55]:
# Concatenate along axis 1
np.concatenate([arr1,arr1],axis=1)

array([[0, 1, 2, 0, 1, 2],
       [3, 4, 5, 3, 4, 5],
       [6, 7, 8, 6, 7, 8]])

In [56]:
# Let's see other axis options
np.concatenate([arr1,arr1],axis=0)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8],
       [0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [57]:
# 看看在pandas下的操作
ser1 =  Series([0,1,2],index=['T','U','V'])

ser2 = Series([3,4],index=['X','Y'])

#Now let use concat (default is axis=0)
pd.concat([ser1,ser2])

T    0
U    1
V    2
X    3
Y    4
dtype: int64

In [58]:
# Now passing along another axis will produce a DataFrame
pd.concat([ser1,ser2],axis=1)

Unnamed: 0,0,1
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [59]:
# We can choose which index labels to keep in the result.
# `join_axes` is no longer accepted by pd.concat, so concatenate first and
# then reindex the resulting DataFrame to the wanted labels.
pd.concat([ser1,ser2],axis=1).reindex(['U','V','Y'])

Unnamed: 0,0,1
U,1.0,
V,2.0,
Y,,4.0


In [60]:
# Lets say we wanted to add markers.keys to the concatenation result

# WE can do this with a hierarchical index
pd.concat([ser1,ser2],keys=['cat1','cat2'])

cat1  T    0
      U    1
      V    2
cat2  X    3
      Y    4
dtype: int64

In [61]:
# Along the axis=1 then these Keys become column headers
pd.concat([ser1,ser2],axis=1,keys=['cat1','cat2'])

Unnamed: 0,cat1,cat2
T,0.0,
U,1.0,
V,2.0,
X,,3.0
Y,,4.0


In [62]:
#Lastly, everything works similarly in DataFrames

dframe1 = DataFrame(np.random.randn(4,3), columns=['X', 'Y', 'Z'])
dframe2 = DataFrame(np.random.randn(3, 3), columns=['Y', 'Q', 'X'])

In [63]:
#Concat on DataFrame
pd.concat([dframe1,dframe2])

Unnamed: 0,Q,X,Y,Z
0,,-1.256687,-1.222438,-1.133118
1,,0.726479,-0.722988,0.212926
2,,0.151096,0.555974,-1.240276
3,,0.16545,1.277996,-0.254625
0,-0.516169,0.852616,-1.054367,
1,-1.209989,0.23153,-0.071854,
2,1.115575,-0.03999,0.72953,


In [64]:
#If we don't care about the index info and just want to make a complete DataFrame, just use ignore_index
pd.concat([dframe1,dframe2],ignore_index=True)

Unnamed: 0,Q,X,Y,Z
0,,-1.256687,-1.222438,-1.133118
1,,0.726479,-0.722988,0.212926
2,,0.151096,0.555974,-1.240276
3,,0.16545,1.277996,-0.254625
4,-0.516169,0.852616,-1.054367,
5,-1.209989,0.23153,-0.071854,
6,1.115575,-0.03999,0.72953,


In [65]:
#For more info in documentation:
url='http://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html'

# Combining DataFrames

In [66]:
# Let's make two Series on the same labels to demonstrate combining data

# First Series: every other value is missing
ser1 = Series([2,np.nan,4,np.nan,6,np.nan],
           index=['Q','R','S','T','U','V'])

# Second Series: 0 .. len(ser1)-1 as floats, on the same labels as ser1
ser2 = Series(np.arange(len(ser1), dtype=np.float64),
           index=['Q','R','S','T','U','V'])

# Blank out the last element *by position*. With a string index, ser2[-1]
# relies on pandas' deprecated integer-as-position fallback (newer releases
# treat integer keys as labels), so use .iloc to make the intent explicit.
ser2.iloc[-1] = np.nan

In [67]:
ser1

Q     2
R   NaN
S     4
T   NaN
U     6
V   NaN
dtype: float64

In [68]:
ser2

Q     0
R     1
S     2
T     3
U     4
V   NaN
dtype: float64

In [69]:
# Now let's get a Series that takes the value from ser2 wherever ser1 is NaN, and otherwise keeps the value from ser1
Series(np.where(pd.isnull(ser1),ser2,ser1),index=ser1.index)

Q     2
R     1
S     4
T     3
U     6
V   NaN
dtype: float64

In [70]:
#Now we can do the same thing simply by using combine_first with pandas
ser1.combine_first(ser2)

# 效果和上面的逻辑是一样的
#This combines the Series values, choosing the values of the calling Series first, unless its a NAN

Q     2
R     1
S     4
T     3
U     6
V   NaN
dtype: float64

In [72]:
# 对应dataframe来说：
#Lets make some 
dframe_odds = DataFrame({'X': [1., np.nan, 3., np.nan],
                     'Y': [np.nan, 5., np.nan, 7.],
                     'Z': [np.nan, 9., np.nan, 11.]})
dframe_evens = DataFrame({'X': [2., 4., np.nan, 6., 8.],
                     'Y': [np.nan, 10., 12., 14., 16.]})

In [73]:
#Show
dframe_odds

Unnamed: 0,X,Y,Z
0,1.0,,
1,,5.0,9.0
2,3.0,,
3,,7.0,11.0


In [74]:
#Show
dframe_evens

Unnamed: 0,X,Y
0,2.0,
1,4.0,10.0
2,,12.0
3,6.0,14.0
4,8.0,16.0


In [75]:
#Now lets combine using odds values first, unless theres a NAN, then put the evens values
dframe_odds.combine_first(dframe_evens)

Unnamed: 0,X,Y,Z
0,1,,
1,4,5.0,9.0
2,3,12.0,
3,6,7.0,11.0
4,8,16.0,


# Reshaping

In [76]:
#Let's see how stack and unstack work

# Create DataFrame
dframe1 = DataFrame(np.arange(8).reshape((2, 4)),
                 index=pd.Index(['LA', 'SF'], name='city'),
                 columns=pd.Index(['A', 'B', 'C','D'], name='letter'))
#Show
dframe1

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [77]:
# Use stack to pivot the columns into the rows
dframe_st = dframe1.stack()

#Show
dframe_st

# 可以看左变成双index

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int32

In [78]:
#We can choose which level to unstack by
dframe_st.unstack(0)

# 0 表示第一列 city

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [79]:
# Also by which name to unstack by
# 可以指定名称
dframe_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [81]:
# Let's see how stack and unstack handle NAN

#Make two series
ser1 = Series([0, 1, 2], index=['Q', 'X', 'Y'])
ser2 = Series([4, 5, 6], index=['X', 'Y', 'Z'])

#Concat to make a Series with a hierarchical index (still named dframe)
dframe = pd.concat([ser1, ser2], keys=['Alpha', 'Beta'])

dframe

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: int64

In [82]:
# Unstack the hierarchical Series into a DataFrame
dframe.unstack()

Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1,2,
Beta,,4,5,6.0


In [83]:
# Now stack will filter out NAN by default
dframe.unstack().stack()

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: float64

In [84]:
# IF we dont want this we can set it to False
dframe.unstack().stack(dropna=False)

Alpha  Q     0
       X     1
       Y     2
       Z   NaN
Beta   Q   NaN
       X     4
       Y     5
       Z     6
dtype: float64

In [85]:
# 暂时未发现stack的作用

# Pivoting

In [86]:
# Let's create some data to play with:

# Note: It is not necessary to understand how this dataset was made to understand this Lecture.

# Create an unpivot function
def unpivot(frame):
    """Flatten a wide frame into long ("stacked") form.

    Returns a DataFrame with one row per cell of `frame` and three columns:
    'date' (the cell's row label), 'variable' (the cell's column label) and
    'value' (the cell itself). Rows are ordered column by column.
    """
    N, K = frame.shape

    data = {'value' : frame.values.ravel('F'),  # 'F' = flatten column by column
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}

    # Return the DataFrame
    return DataFrame(data, columns=['date', 'variable', 'value'])

# Set the DataFrame we'll be using: 3 business days x 4 random columns A-D.
# It is built explicitly because pandas.util.testing and makeTimeDataFrame
# are no longer available in current pandas releases.
dframe = unpivot(DataFrame(np.random.randn(3, 4),
                           index=pd.date_range('2000-01-03', periods=3, freq='B'),
                           columns=list('ABCD')))

In [87]:
#Show the "stacked" data, note how there are multiple variables and values for the dates
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,-0.276743
1,2000-01-04,A,-0.317848
2,2000-01-05,A,-1.516618
3,2000-01-03,B,-0.698157
4,2000-01-04,B,0.791282
5,2000-01-05,B,1.325268
6,2000-01-03,C,-0.86559
7,2000-01-04,C,0.764283
8,2000-01-05,C,-1.460543
9,2000-01-03,D,-0.117507


In [89]:
# Now let's pivot the data

# pivot takes three column names: the column to use as the row index, the
# column whose values become the new column labels, and the column that
# supplies the cell values. They are passed by keyword because newer pandas
# releases do not accept them positionally.
dframe_piv = dframe.pivot(index='date',columns='variable',values='value')

#Show
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-0.276743,-0.698157,-0.86559,-0.117507
2000-01-04,-0.317848,0.791282,0.764283,1.390076
2000-01-05,-1.516618,1.325268,-1.460543,-0.387922


#  Duplicates in DataFrames

In [90]:
#Lets get a dataframe with duplicates

dframe = DataFrame({'key1': ['A'] * 2 + ['B'] * 3,
                  'key2': [2, 2, 2, 3, 3]})

#Show
dframe

Unnamed: 0,key1,key2
0,A,2
1,A,2
2,B,2
3,B,3
4,B,3


In [91]:
#We can use duplicated to find duplicates
dframe.duplicated()

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [92]:
# We can also drop duplicates like this:
dframe.drop_duplicates()

Unnamed: 0,key1,key2
0,A,2
2,B,2
3,B,3


In [93]:
#You can filter which duplicates to drop by a single column
dframe.drop_duplicates(['key1'])

Unnamed: 0,key1,key2
0,A,2
2,B,2


In [94]:
#By default the first value was taken for the duplicates, we can also take the last value instead
# (use keep='last'; the older take_last=True keyword is no longer accepted)
dframe.drop_duplicates(['key1'],keep='last')

Unnamed: 0,key1,key2
1,A,2
4,B,3


# Mapping

In [95]:
# Let's create a dframe to work with (Highest elevation cities in USA)
dframe = DataFrame({'city':['Alma','Brian Head','Fox Park'],
                    'altitude':[3158,3000,2762]})

#Show
dframe

Unnamed: 0,altitude,city
0,3158,Alma
1,3000,Brian Head
2,2762,Fox Park


In [96]:
#Now let's say we wanted to add a column for the States, we can do that with a mapping.
state_map={'Alma':'Colorado','Brian Head':'Utah','Fox Park':'Wyoming'}

In [97]:
# Now we can map that data to our current dframe
dframe['state'] = dframe['city'].map(state_map)

In [98]:
dframe

Unnamed: 0,altitude,city,state
0,3158,Alma,Colorado
1,3000,Brian Head,Utah
2,2762,Fox Park,Wyoming


# Replace

In [99]:
# Lets make  Series
ser1 = Series([1,2,3,4,1,2,3,4])
#Show
ser1

0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64

In [100]:
# Using replace we can select --> .replace(value to be replaced, new_value)
ser1.replace(1,np.nan)

0   NaN
1     2
2     3
3     4
4   NaN
5     2
6     3
7     4
dtype: float64

In [101]:
#Can also input lists
ser1.replace([1,4],[100,400])

# 1 换成100， 4 换成 400
# 表示两个list的长度必须一致

0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64

In [102]:
#Can also input dictionary
ser1.replace({4:np.nan})

# 使用字典的话就是值 哪个值换成哪个值

0     1
1     2
2     3
3   NaN
4     1
5     2
6     3
7   NaN
dtype: float64

# Rename Index

In [103]:
# Making a DataFrame
dframe= DataFrame(np.arange(12).reshape((3, 4)),
                 index=['NY', 'LA', 'SF'],
                 columns=['A', 'B', 'C', 'D'])

#Show
dframe

Unnamed: 0,A,B,C,D
NY,0,1,2,3
LA,4,5,6,7
SF,8,9,10,11


In [104]:
# Just like a Series, axis indexes can also use map

#Let's use map to lowercase the city initials
dframe.index.map(str.lower)

array(['ny', 'la', 'sf'], dtype=object)

In [105]:
# If you want to assign this to the actual index, you can use index
dframe.index = dframe.index.map(str.lower)
#Show
dframe

Unnamed: 0,A,B,C,D
ny,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [106]:
# Use rename if you want to create a transformed version without modifying the original!

#str.title will capitalize the first letter, lowercasing the columns
dframe.rename(index=str.title, columns=str.lower)

Unnamed: 0,a,b,c,d
Ny,0,1,2,3
La,4,5,6,7
Sf,8,9,10,11


In [107]:
# We can also use rename to insert dictionaries providing new values for indexes or columns!
dframe.rename(index={'ny': 'NEW YORK'},
            columns={'A': 'ALPHA'})

Unnamed: 0,ALPHA,B,C,D
NEW YORK,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


In [108]:
# If you would like to actually edit the data set in place, set inplace=True
dframe.rename(index={'ny': 'NEW YORK'}, inplace=True)
dframe

Unnamed: 0,A,B,C,D
NEW YORK,0,1,2,3
la,4,5,6,7
sf,8,9,10,11


# Binning

In [109]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]

In [110]:
# We can separate these years by decade
decade_bins = [1960,1970,1980,1990,2000,2010,2020]

In [111]:
#Now we'll use cut to get something called a Categorical object
# 神奇
decade_cat = pd.cut(years,decade_bins)

In [112]:
decade_cat

[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, object): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]

In [113]:
decade_cat.categories

Index([u'(1960, 1970]', u'(1970, 1980]', u'(1980, 1990]', u'(1990, 2000]',
       u'(2000, 2010]', u'(2010, 2020]'],
      dtype='object')

In [114]:
# Then we can check the value counts in each category
# (the top-level pd.value_counts is deprecated; wrap the Categorical in a
# Series and call its value_counts method instead)
Series(decade_cat).value_counts()

(2010, 2020]    3
(1990, 2000]    3
(2000, 2010]    2
(1980, 1990]    2
(1960, 1970]    1
(1970, 1980]    0
dtype: int64

In [115]:
# We can also pass data values to the cut.

#For instance, if we just wanted to make two bins, evenly spaced based on max and min year, with a 1 year precision
pd.cut(years,2,precision=1)

[(1969, 1992], (1969, 1992], (1969, 1992], (1992, 2015], (1992, 2015], ..., (1969, 1992], (1969, 1992], (1992, 2015], (1992, 2015], (1992, 2015]]
Length: 11
Categories (2, object): [(1969, 1992] < (1992, 2015]]

# Outliers

In [116]:
# Let's see how we would find outliers in a dataset

# First we'll seed the numpy generator
np.random.seed(12345)

#Next we'll create the dataframe
dframe = DataFrame(np.random.randn(1000,4))

In [117]:
dframe.head()

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [118]:
# Lets describe the data
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [119]:
# Lets select the first column
col = dframe[0]

In [120]:
# Now we can check which values in the column have an absolute value greater than 3, for instance.
col[np.abs(col)>3]

523   -3.428254
900    3.366626
Name: 0, dtype: float64

In [121]:
# So we now know in column[0], rows 523 and 900 have values with abs > 3

#How about all the columns?

# We can use the "any" method across each row to keep the rows where at least
# one column has abs > 3. The axis is passed by keyword because newer pandas
# releases do not accept it positionally.
dframe[(np.abs(dframe)>3).any(axis=1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [122]:
# WE could also possibly cap the data at 3

dframe[np.abs(dframe)>3] = np.sign(dframe) *3

In [123]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


# Permutation

In [124]:
# We can randomly reorder (permute) a Series, or the rows in a DataFrame

#Let's take a look
dframe = DataFrame(np.arange(4 * 4).reshape((4, 4)))

#Create an array with a random permutation of 0,1,2,3
blender = np.random.permutation(4)

blender

array([1, 3, 2, 0])

In [125]:
dframe

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15


In [None]:
# Now permute the rows of the dframe based on the blender
dframe.take(blender)