# Introduction to Pandas Part 2

In [1]:
# Show the version of the installed pandas package.
import pandas
pandas.__version__

'0.25.1'

In [2]:
import numpy as np
import pandas as pd

## The Pandas Index Object

In [3]:
# Build an Index directly from a plain Python list of integers.
values = [20, 30, 40, 50, 60, 70]
index = pd.Index(values)
index

Int64Index([20, 30, 40, 50, 60, 70], dtype='int64')

### Index as immutable array

In [4]:
# An Index supports standard Python indexing notation; a single integer position returns the value stored there.

index[1]

30

In [5]:
# A full slice returns an Index containing every element.
index[:]

Int64Index([20, 30, 40, 50, 60, 70], dtype='int64')

In [6]:
# An Index exposes the descriptive attributes familiar from NumPy arrays:
# element count, shape, number of dimensions and element dtype.
attributes = (index.size, index.shape, index.ndim, index.dtype)
print(*attributes)

6 (6,) 1 int64


In [7]:
# One difference between Index objects and NumPy arrays is that an Index is
# immutable: item assignment is rejected with a TypeError.
# The expected error is caught and printed so the demonstration is still
# visible while the notebook keeps running under "Restart Kernel -> Run All".
try:
    index[1] = 0
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

## Operating on Data in Pandas

### Ufuncs: Index Preservation

In [8]:
# pandas is designed to work with NumPy, so NumPy ufuncs accept Series and
# DataFrame objects. Draw the sample values from a seeded generator so they
# are reproducible.
r = np.random.RandomState(15)
s = pd.Series(r.randint(low=0, high=10, size=4))
s

0    8
1    5
2    5
3    7
dtype: int32

In [9]:
# A 5x4 DataFrame of random integers drawn from the same generator `r`.
d = pd.DataFrame(
    r.randint(low=0, high=10, size=(5, 4)),
    columns=list("ABCD"),
)
d

Unnamed: 0,A,B,C,D
0,0,7,5,6
1,1,7,0,4
2,9,7,5,3
3,6,8,2,1
4,1,0,5,2


In [10]:
# Applying a NumPy ufunc to a pandas object returns another pandas object with the index preserved; here a Series goes in and a Series with the same labels comes out.

np.exp(s)

0    2980.957987
1     148.413159
2     148.413159
3    1096.633158
dtype: float64

In [11]:
# A slightly more complex calculation: the arithmetic and the ufunc are both applied element-wise, and the result keeps d's row index and column labels.

np.sin(d*np.pi/4)

Unnamed: 0,A,B,C,D
0,0.0,-0.7071068,-0.707107,-1.0
1,0.707107,-0.7071068,0.0,1.224647e-16
2,0.707107,-0.7071068,-0.707107,0.7071068
3,-1.0,-2.449294e-16,1.0,0.7071068
4,0.707107,0.0,-0.707107,1.0


### Universal Functions: Index Alignment

In [12]:
# Index alignment in Series: two Series whose index labels do not overlap.
fruits = pd.Series([37, 62, 67],
                   index=['apple', 'orange', 'grapes'], name='fruits')
cost = pd.Series([21, 93, 127],
                 index=['chikku', 'kiwi', 'mango'], name='cost')
print(fruits)
cost

apple     37
orange    62
grapes    67
Name: fruits, dtype: int64


chikku     21
kiwi       93
mango     127
Name: cost, dtype: int64

In [13]:
# Dividing the two Series aligns them on their index labels first. They share no labels, so every entry of the result is NaN.

cost/fruits

apple    NaN
chikku   NaN
grapes   NaN
kiwi     NaN
mango    NaN
orange   NaN
dtype: float64

In [14]:
# The result of a binary operation is indexed by the union of the two input
# indices. Use the explicit Index.union method: spelling a set union with
# the `|` operator on Index objects is deprecated in later pandas releases.
fruits.index.union(cost.index)

Index(['apple', 'chikku', 'grapes', 'kiwi', 'mango', 'orange'], dtype='object')

In [15]:
# Alignment works the same way for integer labels: only labels 1 and 2 are
# present in both Series, so labels 0 and 3 come out as NaN in the sum.
# Note: this rebinds `d`, which earlier referred to the random DataFrame.
c = pd.Series([2, 4, 6], index=[0, 1, 2])
d = pd.Series([1, 3, 5], index=[1, 2, 3])
print(c)
print(d)
c + d

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [16]:
# If NaN is not the desired result for labels present in only one operand, use the method form of the operator (here Series.add) and pass fill_value; the missing side is then treated as 0.

c.add(d,fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

## Data wrangling

### Merge operations 

In [17]:
# Keep the raw records and the DataFrame under separate names so that
# `data` never changes type (it was previously a dict, then a DataFrame).
fruit_records = {
    "fruits": ["mango", "apple", "grapes", "orange", "kiwi"],
    "Year": [2013, 2014, 2013, 2014, 2014],
    "cost": [25, 120, 35, 45, 65],
}
print(fruit_records)
data = pd.DataFrame(fruit_records)
data

{'fruits': ['mango', 'apple', 'grapes', 'orange', 'kiwi'], 'Year': [2013, 2014, 2013, 2014, 2014], 'cost': [25, 120, 35, 45, 65]}


Unnamed: 0,fruits,Year,cost
0,mango,2013,25
1,apple,2014,120
2,grapes,2013,35
3,orange,2014,45
4,kiwi,2014,65


In [18]:
# Rebuild the frame with an explicit column order.
column_order = ["Year", "fruits", "cost"]
dp = pd.DataFrame(data, columns=column_order)
dp

Unnamed: 0,Year,fruits,cost
0,2013,mango,25
1,2014,apple,120
2,2013,grapes,35
3,2014,orange,45
4,2014,kiwi,65


In [19]:
# Add a derived column computed element-wise from two existing columns.
dp["extra"] = dp["Year"] / dp["cost"]
dp

Unnamed: 0,Year,fruits,cost,extra
0,2013,mango,25,80.52
1,2014,apple,120,16.783333
2,2013,grapes,35,57.514286
3,2014,orange,45,44.755556
4,2014,kiwi,65,30.984615


In [20]:
# Assign a Series as a new column; its labels 0-4 match the frame's row
# labels one for one.
row_labels = list(range(5))
dp["aligned"] = pd.Series(range(5), index=row_labels)
dp

Unnamed: 0,Year,fruits,cost,extra,aligned
0,2013,mango,25,80.52,0
1,2014,apple,120,16.783333,1
2,2013,grapes,35,57.514286,2
3,2014,orange,45,44.755556,3
4,2014,kiwi,65,30.984615,4


In [21]:
# Round trip: export the frame to a nested dict and rebuild a frame from it.
as_dict = dp.to_dict()
pd.DataFrame(as_dict)

Unnamed: 0,Year,fruits,cost,extra,aligned
0,2013,mango,25,80.52,0
1,2014,apple,120,16.783333,1
2,2013,grapes,35,57.514286,2
3,2014,orange,45,44.755556,3
4,2014,kiwi,65,30.984615,4


In [22]:
# Merging means combining different data sets by linking their rows on one or more keys. This is the left-hand frame used in the merges that follow.

dp

Unnamed: 0,Year,fruits,cost,extra,aligned
0,2013,mango,25,80.52,0
1,2014,apple,120,16.783333,1
2,2013,grapes,35,57.514286,2
3,2014,orange,45,44.755556,3
4,2014,kiwi,65,30.984615,4


In [23]:
# A second table keyed by fruit name; note that the rates are stored as strings.
dp2 = pd.DataFrame(
    [("kiwi", "100"), ("orange", "200"), ("mango", "300")],
    columns=["fruits", "rate"],
)
dp2

Unnamed: 0,fruits,rate
0,kiwi,100
1,orange,200
2,mango,300


In [24]:
# With no keys given, merge joins on the column names the two frames have in common (here only "fruits").

dp.merge(dp2)

Unnamed: 0,Year,fruits,cost,extra,aligned,rate
0,2013,mango,25,80.52,0,300
1,2014,orange,45,44.755556,3,200
2,2014,kiwi,65,30.984615,4,100


In [25]:
# dp and dp3 share two column names, "fruits" and "cost", so the join key
# is named explicitly. The non-key "cost" columns are both kept and told
# apart by the _x (left) / _y (right) suffixes.
# If the key columns had different names in the two frames, they would be
# given separately with left_on= and right_on=.
dp3 = pd.DataFrame({"fruits": ["mango", "orange"], "cost": ["100", "200"]})
dp.merge(dp3, on="fruits")

Unnamed: 0,Year,fruits,cost_x,extra,aligned,cost_y
0,2013,mango,25,80.52,0,100
1,2014,orange,45,44.755556,3,200


In [26]:
# merge performs an inner join by default; pass `how` to request another
# join type. An outer join keeps the rows of dp that have no match in dp4.
dp4 = pd.DataFrame({"fruits": ["mango", "orange"], "rate": ["100", "200"]})
outer_joined = dp.merge(dp4, how="outer")
outer_joined

Unnamed: 0,Year,fruits,cost,extra,aligned,rate
0,2013,mango,25,80.52,0,100.0
1,2014,apple,120,16.783333,1,
2,2013,grapes,35,57.514286,2,
3,2014,orange,45,44.755556,3,200.0
4,2014,kiwi,65,30.984615,4,


### Combining data with overlap

In [27]:
# Sometimes a dataset has gaps that can be "patched" from another dataset.
# sa has missing values; sb has a value for every label.
labels = list("fedcba")
sa = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=labels)
sb = pd.Series(np.arange(len(sa), dtype=np.float64), index=labels)

In [28]:
# sa is missing (NaN) at labels f, d and a.
sa

f    NaN
e    2.5
d    NaN
c    3.5
b    4.5
a    NaN
dtype: float64

In [29]:
# sb has a value for every label, so it can supply each gap in sa.
sb

f    0.0
e    1.0
d    2.0
c    3.0
b    4.0
a    5.0
dtype: float64

In [30]:
# Fill the gaps in sa with the matching values from sb. The NumPy-style way
# is an element-wise selection with np.where, re-wrapped in a Series.
missing = sa.isnull()
pd.Series(np.where(missing, sb, sa), index=sa.index)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

In [31]:
# The pandas way to do the same patch: keep sa's values and fall back to sb wherever sa is NaN.
sa.combine_first(sb)

f    0.0
e    2.5
d    2.0
c    3.5
b    4.5
a    5.0
dtype: float64

# The End