# Introduction to Pandas
## Installing and using Pandas

In [1]:
import pandas as pd

In [6]:
!pip install pandas



You should consider upgrading via the 'c:\python\python391\python.exe -m pip install --upgrade pip' command.
You should consider upgrading via the 'c:\python\python391\python.exe -m pip install --upgrade pip' command.






## Introducing pandas objects
## Series

In [2]:
counts=pd.Series([501,1000,1200,109])
counts

0     501
1    1000
2    1200
3     109
dtype: int64

In [3]:
counts.values

array([ 501, 1000, 1200,  109], dtype=int64)

In [4]:
counts.index

RangeIndex(start=0, stop=4, step=1)

In [5]:
animals=pd.Series([501,1000,1200,109],
                 index=['Cat','Dog','Tiger','Lion'])

In [6]:
animals

Cat       501
Dog      1000
Tiger    1200
Lion      109
dtype: int64

In [7]:
animals['Tiger']

1200

In [8]:
animals[1]

1000

In [9]:
animals.name='counts'
animals.index.name='Elephant'

In [10]:
animals

Elephant
Cat       501
Dog      1000
Tiger    1200
Lion      109
Name: counts, dtype: int64

In [11]:
import numpy as np
np.log(animals)

Elephant
Cat      6.216606
Dog      6.907755
Tiger    7.090077
Lion     4.691348
Name: counts, dtype: float64

In [12]:
animals[animals>1000]

Elephant
Tiger    1200
Name: counts, dtype: int64

In [13]:
animals_dict={'Cat':501,'Dog':1000,'Tiger':1200,'Lion':109}
print(animals_dict)
pd.Series(animals_dict)

{'Cat': 501, 'Dog': 1000, 'Tiger': 1200, 'Lion': 109}


Cat       501
Dog      1000
Tiger    1200
Lion      109
dtype: int64

## DataFrame:bi-dimensional series with two(or more) indices 

In [14]:
data={"Province":["FL","FL","NH","NH","ZH"],
     "Year":[2003,2001,2003,2001,2001],
     "Literacy":[0.1,0.2,0.4,0.3,0.4]}

In [15]:
print(data)

{'Province': ['FL', 'FL', 'NH', 'NH', 'ZH'], 'Year': [2003, 2001, 2003, 2001, 2001], 'Literacy': [0.1, 0.2, 0.4, 0.3, 0.4]}


In [16]:
data:pd.DataFrame(data)
data

{'Province': ['FL', 'FL', 'NH', 'NH', 'ZH'],
 'Year': [2003, 2001, 2003, 2001, 2001],
 'Literacy': [0.1, 0.2, 0.4, 0.3, 0.4]}

## To change order of columns 

In [17]:
df=pd.DataFrame(data,columns=["Year","Province","Literacy"])

In [18]:
df

Unnamed: 0,Year,Province,Literacy
0,2003,FL,0.1
1,2001,FL,0.2
2,2003,NH,0.4
3,2001,NH,0.3
4,2001,ZH,0.4


In [19]:
df['shri']=df.Year/df.Literacy

In [20]:
df

Unnamed: 0,Year,Province,Literacy,shri
0,2003,FL,0.1,20030.0
1,2001,FL,0.2,10005.0
2,2003,NH,0.4,5007.5
3,2001,NH,0.3,6670.0
4,2001,ZH,0.4,5002.5


In [21]:
df['Series_aligned']=pd.Series(range(5),index=[0,1,2,3,4])

In [22]:
df

Unnamed: 0,Year,Province,Literacy,shri,Series_aligned
0,2003,FL,0.1,20030.0,0
1,2001,FL,0.2,10005.0,1
2,2003,NH,0.4,5007.5,2
3,2001,NH,0.3,6670.0,3
4,2001,ZH,0.4,5002.5,4


In [23]:
df.to_dict()

{'Year': {0: 2003, 1: 2001, 2: 2003, 3: 2001, 4: 2001},
 'Province': {0: 'FL', 1: 'FL', 2: 'NH', 3: 'NH', 4: 'ZH'},
 'Literacy': {0: 0.1, 1: 0.2, 2: 0.4, 3: 0.3, 4: 0.4},
 'shri': {0: 20030.0, 1: 10005.0, 2: 5007.5, 3: 6670.0, 4: 5002.5},
 'Series_aligned': {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}}

In [24]:
pd.DataFrame(df.to_dict())

Unnamed: 0,Year,Province,Literacy,shri,Series_aligned
0,2003,FL,0.1,20030.0,0
1,2001,FL,0.2,10005.0,1
2,2003,NH,0.4,5007.5,2
3,2001,NH,0.3,6670.0,3
4,2001,ZH,0.4,5002.5,4


## DataFrame as specialized dictionary
## From a list of dicts

In [25]:
data=[{'a':i,'b':20*i}for i in range(6)]

In [26]:
print(data)

[{'a': 0, 'b': 0}, {'a': 1, 'b': 20}, {'a': 2, 'b': 40}, {'a': 3, 'b': 60}, {'a': 4, 'b': 80}, {'a': 5, 'b': 100}]


In [27]:
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,20
2,2,40
3,3,60
4,4,80
5,5,100


In [28]:
pd.DataFrame([{'ss':3,'hh':2},{'hh':4,'ii':6}])

Unnamed: 0,ss,hh,ii
0,3.0,2,
1,,4,6.0


In [29]:
df=pd.DataFrame([{'ss':1,'hh':6},{'hh':5,'ii':4}])

In [30]:
df=df.to_dict()
df

{'ss': {0: 1.0, 1: nan}, 'hh': {0: 6, 1: 5}, 'ii': {0: nan, 1: 4.0}}

## From a two-dimensional Numpy array 

In [31]:
pd.DataFrame(np.random.randint(2,20),
            columns=['foo','bar'],
            index=['a','b','c'])

Unnamed: 0,foo,bar
a,12,12
b,12,12
c,12,12


## The Pandas index object 

In [32]:
shri=pd.Index([20,19,17,37,21,18])
shri

Int64Index([20, 19, 17, 37, 21, 18], dtype='int64')

## Index as immutable array 

In [33]:
shri[1]

19

In [34]:
shri[::]

Int64Index([20, 19, 17, 37, 21, 18], dtype='int64')

In [35]:
print(shri.size,shri.shape,shri.ndim,shri.dtype)

6 (6,) 1 int64


In [36]:
shri[1]= 0

TypeError: Index does not support mutable operations

## Operating on data in pandas
## ufuns:index Preservation

In [37]:
rng=np.random.RandomState(20)
rakshi=pd.Series(rng.randint(0,20,6))
rakshi

0     3
1    15
2     9
3    11
4     7
5     2
dtype: int32

In [38]:
dfr=pd.DataFrame(rng.randint(1,20,(5,4)),columns=['W','X','Y','Z'])

In [39]:
dfr

Unnamed: 0,W,X,Y,Z
0,1,9,17,7
1,7,17,10,6
2,8,6,3,7
3,14,12,4,11
4,12,14,15,1


In [40]:
np.exp(rakshi)

0    2.008554e+01
1    3.269017e+06
2    8.103084e+03
3    5.987414e+04
4    1.096633e+03
5    7.389056e+00
dtype: float64

In [41]:
np.sin(dfr*np.pi/4)

Unnamed: 0,W,X,Y,Z
0,0.7071068,0.7071068,0.7071068,-0.707107
1,-0.7071068,0.7071068,1.0,-1.0
2,-2.449294e-16,-1.0,0.7071068,-0.707107
3,-1.0,3.67394e-16,1.224647e-16,0.707107
4,3.67394e-16,-1.0,-0.7071068,0.707107


## Index aligment in Series 

In [42]:
area=pd.Series({'Manglore':2732228,'Banglore':783893,'Kodagu':538528},name='area')
population=pd.Series({'Manglore':37443631,'Banglore':37559284,'udupi':28522383},name='population')

In [43]:
print(area)
population

Manglore    2732228
Banglore     783893
Kodagu       538528
Name: area, dtype: int64


Manglore    37443631
Banglore    37559284
udupi       28522383
Name: population, dtype: int64

In [44]:
population/area

Banglore    47.913789
Kodagu            NaN
Manglore    13.704431
udupi             NaN
dtype: float64

In [45]:
area.index|population.index

  area.index|population.index


Index(['Banglore', 'Kodagu', 'Manglore', 'udupi'], dtype='object')

In [46]:
s=pd.Series([1,2,3],index=[0,5,6])
h=pd.Series([2,4,5],index=[7,5,0])

In [47]:
print(s)
print(h)

0    1
5    2
6    3
dtype: int64
7    2
5    4
0    5
dtype: int64


In [48]:
h
s+h

0    6.0
5    6.0
6    NaN
7    NaN
dtype: float64

In [49]:
s.add(h,fill_value=0)

0    6.0
5    6.0
6    3.0
7    2.0
dtype: float64

# Ufuncs:Operations Between DataFrame and Series
## Merge Operation

In [50]:
print(df)

{'ss': {0: 1.0, 1: nan}, 'hh': {0: 6, 1: 5}, 'ii': {0: nan, 1: 4.0}}


In [51]:
df2=pd.DataFrame({"Province":["FL","NH","ZH"],"Population":["100000","200000","300000"]})
df2

Unnamed: 0,Province,Population
0,FL,100000
1,NH,200000
2,ZH,300000


In [52]:
#merge is smart!If there are overlapping names,it uses those for the merge
df.merge(df2)

AttributeError: 'dict' object has no attribute 'merge'

In [53]:
df3=pd.DataFrame({"Province":["FL","NH"],"Population":["100000","200000"]})
df3

Unnamed: 0,Province,Population
0,FL,100000
1,NH,200000


In [54]:
df.merge(df3,right_on='province',left_on='province')

AttributeError: 'dict' object has no attribute 'merge'

In [55]:
df.merge(df4,how='outer')

AttributeError: 'dict' object has no attribute 'merge'

In [56]:
df5=pd.DataFrame({"province":["100000","200000","500000"]})
print(df5)

  province
0   100000
1   200000
2   500000


## Combining data with overlap

In [57]:
Serie_a=pd.Series([np.NaN,2.5,np.NaN,3.5,4.5,np.NaN],index=['q','w','e','r','t','y'])

In [58]:
Serie_b=pd.Series(np.arange(len(Serie_a),dtype=np.float64),index=['q','w','e','r','t','y'])

In [59]:
Serie_a

q    NaN
w    2.5
e    NaN
r    3.5
t    4.5
y    NaN
dtype: float64

In [60]:
Serie_b

q    0.0
w    1.0
e    2.0
r    3.0
t    4.0
y    5.0
dtype: float64

In [61]:
Serie_a.combine_first(Serie_b)

q    0.0
w    2.5
e    2.0
r    3.5
t    4.5
y    5.0
dtype: float64