In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Five integer observations labelled 'a' through 'e'.
s1 = pd.Series(
    data=[7, 5, 4, 6, 7],
    index=list('abcde'),
)

In [4]:
s1

a    7
b    5
c    4
d    6
e    7
dtype: int64

In [5]:
# Six integer observations; only the labels 'a', 'c' and 'e' are shared with s1.
s2 = pd.Series(
    data=[1, 43, 4, 6, 7, 8],
    index=list('acefgh'),
)

In [6]:
s2

a     1
c    43
e     4
f     6
g     7
h     8
dtype: int64

In [7]:
s1 + s2

a     8.0
b     NaN
c    47.0
d     NaN
e    11.0
f     NaN
g     NaN
h     NaN
dtype: float64

In [8]:
### One of the most important pandas features is the behaviour of arithmetic operations between objects with different indexes.
### When adding objects together, if any index pairs are not the same, the index of the result will be the union
### of the index pairs

In [9]:
### The internal data alignment introduces NA values at the labels that don't overlap. Missing values propagate in arithmetic
### computations


In [10]:
# Sample 4x4 data as a plain 2-D ndarray (NumPy no longer recommends np.matrix),
# wrapped in a DataFrame with one row per city. Column labels are assigned in a later cell.
a = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [13, 14, 15, 16],
])
df1 = pd.DataFrame(a, index=['Chennai', 'Bangalore', 'Kolkata', 'Pune'])

In [11]:
df1

Unnamed: 0,0,1,2,3
Chennai,1,2,3,4
Bangalore,5,6,7,8
Kolkata,9,10,11,12
Pune,13,14,15,16


In [12]:
df1.columns = ['2001','2002','2003','2004']

In [13]:
df1

Unnamed: 0,2001,2002,2003,2004
Chennai,1,2,3,4
Bangalore,5,6,7,8
Kolkata,9,10,11,12
Pune,13,14,15,16


In [14]:
# Second 4x4 sample as a plain 2-D ndarray (NumPy no longer recommends np.matrix).
# df2 shares only some row and column labels with df1, which is what the alignment
# examples below rely on.
a = np.array([
    [2, 24, 3, 4],
    [51, 6, 17, 18],
    [69, 10, 61, 121],
    [3, 43, 51, 16],
])
df2 = pd.DataFrame(
    a,
    index=['Chennai', 'Mumbai', 'Bangalore', 'Pune'],
    columns=['2001', '2002', '2004', '2005'],
)

In [15]:
df2

Unnamed: 0,2001,2002,2004,2005
Chennai,2,24,3,4
Mumbai,51,6,17,18
Bangalore,69,10,61,121
Pune,3,43,51,16


In [16]:
df1 + df2

Unnamed: 0,2001,2002,2003,2004,2005
Bangalore,74.0,16.0,,69.0,
Chennai,3.0,26.0,,7.0,
Kolkata,,,,,
Mumbai,,,,,
Pune,16.0,57.0,,67.0,


In [17]:
### In arithmetic operations between differently indexed objects, you might wanna fill with a special value like 0,
### when an axis label is found in one object but not in the other

In [18]:
### So try this

In [19]:
# fill_value=0 stands in for a value that is missing from only one of the two frames;
# a cell that is absent from both (e.g. Kolkata/2005, Mumbai/2003) is still NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,2001,2002,2003,2004,2005
Bangalore,74.0,16.0,7.0,69.0,121.0
Chennai,3.0,26.0,3.0,7.0,4.0
Kolkata,9.0,10.0,11.0,12.0,
Mumbai,51.0,6.0,,17.0,18.0
Pune,16.0,57.0,15.0,67.0,16.0


In [20]:
### You can also reindex a Series or DataFrame and specify a different fill value

In [21]:
### So this is the reindexed df1

df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,2001,2002,2004,2005
Chennai,1,2,4,0
Bangalore,5,6,8,0
Kolkata,9,10,12,0
Pune,13,14,16,0


In [22]:
### And here is the original df1

df1

Unnamed: 0,2001,2002,2003,2004
Chennai,1,2,3,4
Bangalore,5,6,7,8
Kolkata,9,10,11,12
Pune,13,14,15,16


In [23]:
### In the reindexed one, the columns have been reindexed according to the second data frame df2: the column missing from
### df1 (2005) is filled with 0, and the column that df2 does not have (2003) is dropped.
### The values for the other columns are retained from the original

In [24]:
### Now lets do arithmetic between a series and data frame

In [25]:
df1

Unnamed: 0,2001,2002,2003,2004
Chennai,1,2,3,4
Bangalore,5,6,7,8
Kolkata,9,10,11,12
Pune,13,14,15,16


In [26]:
s1

a    7
b    5
c    4
d    6
e    7
dtype: int64

In [27]:
# s1 is still labelled 'a'..'e', none of which match df1's column labels, so the
# aligned result holds the union of both label sets and every cell is NaN.
df1+s1

Unnamed: 0,2001,2002,2003,2004,a,b,c,d,e
Chennai,,,,,,,,,
Bangalore,,,,,,,,,
Kolkata,,,,,,,,,
Pune,,,,,,,,,


In [28]:
s1.columns = ['2001','2002','2003','2004']

In [29]:
s1

a    7
b    5
c    4
d    6
e    7
dtype: int64

In [30]:
### Assigning to s1.columns had no effect on the labels: a Series has no columns, its labels live in .index. So let's set that instead.

s1.index = ['2001','2002','2003','2004','2005']

In [31]:
s1

2001    7
2002    5
2003    4
2004    6
2005    7
dtype: int64

In [32]:
df1+s1

Unnamed: 0,2001,2002,2003,2004,2005
Chennai,8,7,7,10,
Bangalore,12,11,11,14,
Kolkata,16,15,15,18,
Pune,20,19,19,22,


In [33]:
df1

Unnamed: 0,2001,2002,2003,2004
Chennai,1,2,3,4
Bangalore,5,6,7,8
Kolkata,9,10,11,12
Pune,13,14,15,16


In [34]:
s1

2001    7
2002    5
2003    4
2004    6
2005    7
dtype: int64

In [35]:
### Hence df1+s1 matches the index of the series (2001, 2002 and so on) against the column labels of the data frame
### and adds the series to every row. Column 2005 exists only in s1, so it is all NaN in the result.

In [36]:
# Take the first row of df2 by position. .iloc is the positional indexer;
# the older .ix indexer was deprecated and later removed from pandas.
series = df2.iloc[0]

In [37]:
series

2001     2
2002    24
2004     3
2005     4
Name: Chennai, dtype: int32

In [38]:
df2

Unnamed: 0,2001,2002,2004,2005
Chennai,2,24,3,4
Mumbai,51,6,17,18
Bangalore,69,10,61,121
Pune,3,43,51,16


In [39]:
### The first row of the data frame is assigned to a series here.

In [40]:
### By default, an arithmetic operation between a DataFrame and Series matches the index of the Series on the DF's columns,
### broadcasting down the rows.

In [41]:
df2-series

Unnamed: 0,2001,2002,2004,2005
Chennai,0,0,0,0
Mumbai,49,-18,14,14
Bangalore,67,-14,58,117
Pune,1,19,48,12


In [42]:
### This is a perfect example of broadcasting. 
### To be simple and straight, the series object "series" is subtracted from every row of the data frame, matching the columns
### while doing so.

In [43]:
df2+series

Unnamed: 0,2001,2002,2004,2005
Chennai,4,48,6,8
Mumbai,53,30,20,22
Bangalore,71,34,64,125
Pune,5,67,54,20


In [44]:
series

2001     2
2002    24
2004     3
2005     4
Name: Chennai, dtype: int32

In [45]:
### If you, instead wanna broadcast over the columns, matching on the rows, we have to use one of the arithmetic methods as follows

In [56]:
df2

Unnamed: 0,2001,2002,2004,2005
Chennai,2,24,3,4
Mumbai,51,6,17,18
Bangalore,69,10,61,121
Pune,3,43,51,16


In [57]:
series1 = df2['2001']

In [58]:
series1

Chennai       2
Mumbai       51
Bangalore    69
Pune          3
Name: 2001, dtype: int32

In [59]:
df2.add(series1, axis = 0)

Unnamed: 0,2001,2002,2004,2005
Chennai,4,26,5,6
Mumbai,102,57,68,69
Bangalore,138,79,130,190
Pune,6,46,54,19


In [60]:
### In the above example, the axis you pass is the axis to match on: axis = 0 matches the index of the series against
### the row labels of the DF, and the "add" operation is then broadcast across the columns

### Lets try with another axis value

In [61]:
df2.add(series1, axis = 1)

Unnamed: 0,2001,2002,2004,2005,Bangalore,Chennai,Mumbai,Pune
Chennai,,,,,,,,
Mumbai,,,,,,,,
Bangalore,,,,,,,,
Pune,,,,,,,,


In [62]:
### Trying to comprehend the weird output??

### Try axis = 2 (a DataFrame only has axes 0 and 1)

df2.add(series1, axis = 2)

ValueError: No axis named 2 for object type <class 'pandas.core.frame.DataFrame'>

In [63]:
### Weird right?

In [64]:
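### Alright, the simple explanation is that the DF is 2-dimensional, so only axis 0 and axis 1 exist.
### axis 0 is the row index: add(series1, axis = 0) matches series1's labels (the cities) against the row labels.
### axis 1 is the columns: add(series1, axis = 1) matches the cities against the column labels (the years); nothing
### matches, so the result has the union of both sets of labels and every value is NaN.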

### More explanation on this
### http://stackoverflow.com/questions/22149584/what-does-axis-in-pandas-mean


In [65]:
### Comprende????

In [66]:
### Well, lets move on to the next step

In [67]:
df3 = df2 - series

In [68]:
df3

Unnamed: 0,2001,2002,2004,2005
Chennai,0,0,0,0
Mumbai,49,-18,14,14
Bangalore,67,-14,58,117
Pune,1,19,48,12


In [69]:
### Lets try some element-wise array methods

In [70]:
np.abs(df3)

Unnamed: 0,2001,2002,2004,2005
Chennai,0,0,0,0
Mumbai,49,18,14,14
Bangalore,67,14,58,117
Pune,1,19,48,12


In [71]:
### The above 'abs' method is used to calculate the absolute value element-wise.

In [72]:
### Lets apply a function on 1D arrays to each row or column. We could use DataFrame's "apply" method

In [73]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

In [74]:
df3.apply(f)

2001     67
2002     37
2004     58
2005    117
dtype: int64

In [75]:
### By default the function is applied along axis 0, i.e. it is called once per column
### So for each column, x is that column and the result is the max of the column minus the min of the same column
### So for column 2002, the max element is 19 and the min element is -18 and their difference is 37.

In [76]:
### Many of the most common array statistics (like sum and mean) are DataFrame methods, so using "apply" is not necessary.

In [77]:
### The function passed to "apply" need not return a scalar value. It can also return a Series with multiple values.

In [78]:
def f(x):
    """Return the smallest and largest values of ``x`` as a Series labelled 'min' and 'max'."""
    lowest = x.min()
    highest = x.max()
    return pd.Series(data=[lowest, highest], index=['min', 'max'])

In [79]:
df3.apply(f)

Unnamed: 0,2001,2002,2004,2005
min,0,-18,0,0
max,67,19,58,117


In [80]:
### The above function simply returns the min and max values in the respective columns.

In [86]:
def format(x):
    """Render the number ``x`` as a string with three digits after the decimal point."""
    return '%.3f' % x

In [87]:
### The above function formats a single number as a string with 3 digits after the decimal point.
### It can be applied to each element of an object (Series or DataFrame).

### Series has a map method that can be used for applying an element-wise function
df3['2002'].map(format)

Chennai        0.000
Mumbai       -18.000
Bangalore    -14.000
Pune          19.000
Name: 2002, dtype: object

In [88]:
### Here is the original DataFrame

df3

Unnamed: 0,2001,2002,2004,2005
Chennai,0,0,0,0
Mumbai,49,-18,14,14
Bangalore,67,-14,58,117
Pune,1,19,48,12


In [89]:
### applymap can be used to apply the function to every element of the DF

df3.applymap(format)

Unnamed: 0,2001,2002,2004,2005
Chennai,0.0,0.0,0.0,0.0
Mumbai,49.0,-18.0,14.0,14.0
Bangalore,67.0,-14.0,58.0,117.0
Pune,1.0,19.0,48.0,12.0


In [None]:
### For more info on lambda functions, refer http://www.secnetix.de/olli/Python/lambda_functions.hawk