#### Pandas library is designed on the basis of the NumPy library


In [1]:
import pandas as pd
import numpy as np

In [2]:
from pandas import *

#### The heart of pandas is the two primary data structures:
####   (i)Series
#####   (ii)Dataframes

# Series

# Series is used to represent one dimensional datastructure.
# ![image-2.png](attachment:image-2.png)

In [3]:
# Declaring a series
s=pd.Series([12,-4,7,9])
s

0    12
1    -4
2     7
3     9
dtype: int64

In [4]:
s = pd.Series([12,-4,7,9], index=['a','b','c','d'])
s

a    12
b    -4
c     7
d     9
dtype: int64

In [5]:
s.values

array([12, -4,  7,  9], dtype=int64)

In [6]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
#You can select individual elements as ordinary numpy arrays
print(s[2])

#Or you can specify the label corresponding to the position of the index.
print(s['b'])

7
-4


In [8]:
# accessing multiple items in the array
s[0:2]

a    12
b    -4
dtype: int64

In [9]:
#one can use the corresponding labels by specifying the list of labels.
s[['b','c']]

b   -4
c    7
dtype: int64

In [10]:
# Assigning values to the elements
s[1]=0
s

a    12
b     0
c     7
d     9
dtype: int64

In [11]:
s['b']=1
s

a    12
b     1
c     7
d     9
dtype: int64

In [12]:
#Defining a Series from NumPy Arrays 
arr = np.array([1,2,3,4])
s3 = pd.Series(arr)
s3

0    1
1    2
2    3
3    4
dtype: int32

In [13]:
s4=pd.Series(s)
s4

a    12
b     1
c     7
d     9
dtype: int64

In [14]:
s3

0    1
1    2
2    3
3    4
dtype: int32

In [15]:
arr[2]=-2
s3

0    1
1    2
2   -2
3    4
dtype: int32

In [16]:
s3[1]=1
arr

array([ 1,  1, -2,  4])

In [17]:
# Filtering values
s[s>8]

a    12
d     9
dtype: int64

In [18]:
#Operations and mathematical functions
'''operations such as operators (+, -, *, and /) and mathematical functions that are
    applicable to NumPy array can be extended to series.'''
print(s)
print(s/2)
print(s+2)

a    12
b     1
c     7
d     9
dtype: int64
a    6.0
b    0.5
c    3.5
d    4.5
dtype: float64
a    14
b     3
c     9
d    11
dtype: int64


In [19]:
np.log(s)

a    2.484907
b    0.000000
c    1.945910
d    2.197225
dtype: float64

In [20]:
#Evaluating values
s1 = pd.Series([1,0,2,1,2,3], index=['white','white','blue','green','green','yellow'])
s1

white     1
white     0
blue      2
green     1
green     2
yellow    3
dtype: int64

In [21]:
# 'unique()' function is used to know all the values contained in the series, excluding duplicates.
s1.unique()

array([1, 0, 2, 3], dtype=int64)

In [22]:
#value_counts(), which not only returns unique values but also calculates the occurrences within a series.
s1.value_counts()

1    2
2    2
0    1
3    1
dtype: int64

In [23]:
#isin() tells you if the values are contained in the data structure or not.
s1.isin([0,3])

white     False
white      True
blue      False
green     False
green     False
yellow     True
dtype: bool

In [25]:
s1[s1.isin([0,3])]

white     0
yellow    3
dtype: int64

In [27]:
ss=-3
np.log(ss)

  np.log(ss)


nan

In [28]:
#NaN -- Not a Number
s2 = pd.Series([5,-3,np.NaN,14])
s2

0     5.0
1    -3.0
2     NaN
3    14.0
dtype: float64

In [29]:
# The isnull() and notnull() functions are very useful to identify the indexes without a value.
s2.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [30]:
s2.notnull()

0     True
1     True
2    False
3     True
dtype: bool

In [31]:
 s2[s2.notnull()]

0     5.0
1    -3.0
3    14.0
dtype: float64

In [32]:
s2[s2.isnull()]

2   NaN
dtype: float64

In [33]:
# Series as Dictionaries
mydict = {'red': 2000, 'blue': 1000, 'yellow': 500,'orange': 1000}
mydict

{'red': 2000, 'blue': 1000, 'yellow': 500, 'orange': 1000}

In [35]:
myseries = pd.Series(mydict)
myseries

red       2000
blue      1000
yellow     500
orange    1000
dtype: int64

In [36]:
colors = ['red','yellow','orange','blue','green']
myseries = pd.Series(mydict, index=colors)
myseries

red       2000.0
yellow     500.0
orange    1000.0
blue      1000.0
green        NaN
dtype: float64

In [37]:
#Operations between series
mydict2 = {'red':400,'yellow':1000,'black':700}
myseries2 = pd.Series(mydict2)
myseries+myseries2

black        NaN
blue         NaN
green        NaN
orange       NaN
red       2400.0
yellow    1500.0
dtype: float64

# Data Frame
#### ![image.png](attachment:image.png)

#### The dataframe is a tabular data structure very similar to a spreadsheet. This data structure is designed 
#### to extend series to multiple dimensions. It has two indexes, one is row index and other is column index.

In [38]:
# Defining a Dataframe
data = {'color' : ['blue','green','yellow','red','white'],'object' : ['ball','pen','pencil','paper','mug'],
        'price' : [1.2,1.0,0.6,0.9,1.7]}
frame = pd.DataFrame(data)    

In [39]:
frame

Unnamed: 0,color,object,price
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [40]:
# specifying sequence of columns using columns option
frame2 = pd.DataFrame(data, columns=['object','price'])
frame2

Unnamed: 0,object,price
0,ball,1.2
1,pen,1.0
2,pencil,0.6
3,paper,0.9
4,mug,1.7


In [41]:
#index atttribute is used to assign row labels
frame2 = pd.DataFrame(data, index=['one','two','three','four','five'])
frame2

Unnamed: 0,color,object,price
one,blue,ball,1.2
two,green,pen,1.0
three,yellow,pencil,0.6
four,red,paper,0.9
five,white,mug,1.7


In [42]:
frame3 = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],
                      columns=['ball','pen','pencil','paper'])
frame3

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [43]:
# If you want to know all the columns in tha dataset one can use '.columns' function
frame3.columns

Index(['ball', 'pen', 'pencil', 'paper'], dtype='object')

In [44]:
# if you want to get all the index of rows one can use '.index' function
frame3.index

Index(['red', 'blue', 'yellow', 'white'], dtype='object')

In [45]:
frame.index

RangeIndex(start=0, stop=5, step=1)

In [46]:
# one can get the entire set of data contained within the data structure using the values attribute.
frame3.values

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])

In [47]:
# selecting a column
frame3['ball']

red        0
blue       4
yellow     8
white     12
Name: ball, dtype: int32

In [48]:
frame['price']

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [49]:
frame.price

0    1.2
1    1.0
2    0.6
3    0.9
4    1.7
Name: price, dtype: float64

In [53]:
frame.loc[2]
# In the output names of the columns have become the label of the array index, 
#and the values have become the data of series.

color     yellow
object    pencil
price        0.6
Name: 2, dtype: object

In [56]:
frame.loc[[2,4]]

Unnamed: 0,color,object,price
2,yellow,pencil,0.6
4,white,mug,1.7


In [59]:
frame[0:1]

Unnamed: 0,color,object,price
0,blue,ball,1.2


In [60]:
frame[1:3]

Unnamed: 0,color,object,price
1,green,pen,1.0
2,yellow,pencil,0.6


In [61]:
# If you need a single value in the dataframe
frame['object'][3]

'paper'

In [65]:
# If you want to place a names to rows and columns you can assign like this
frame.index.name='id'
frame.columns.name='item'
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,0.6
3,red,paper,0.9
4,white,mug,1.7


In [67]:
#specifying a new column
frame['new']=12
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,12
1,green,pen,1.0,12
2,yellow,pencil,0.6,12
3,red,paper,0.9,12
4,white,mug,1.7,12


In [68]:
#If you want to update the values of new column
frame['new']=[3.0,1.3,2.2,0.8,1.1]
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,3.0
1,green,pen,1.0,1.3
2,yellow,pencil,0.6,2.2
3,red,paper,0.9,0.8
4,white,mug,1.7,1.1


In [69]:
ser = pd.Series(np.arange(5))
ser

0    0
1    1
2    2
3    3
4    4
dtype: int32

In [80]:
frame['new']=ser

In [71]:
frame


item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,0.6,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [73]:
frame['price'][2]=3.3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame['price'][2]=3.3


In [74]:
frame

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,ball,1.2,0
1,green,pen,1.0,1
2,yellow,pencil,3.3,2
3,red,paper,0.9,3
4,white,mug,1.7,4


In [75]:
frame.isin([1.0,'pen'])

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,False,False,False
1,False,True,True,True
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False


In [76]:
frame[frame.isin([1.0,'pen'])]

item,color,object,price,new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,,,,
1,,pen,1.0,1.0
2,,,,
3,,,,
4,,,,


In [81]:
#Deleting a column
del frame['new']
frame

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,blue,ball,1.2
1,green,pen,1.0
2,yellow,pencil,3.3
3,red,paper,0.9
4,white,mug,1.7


In [87]:
#Filtering
frame[frame['price']<1.2]

item,color,object,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,green,pen,1.0
3,red,paper,0.9


In [88]:
#Data Frame from nested dict\
nestdict = {'red':{2012: 22, 2013: 33},'white':{2011: 13, 2012: 22, 2013: 16},
            'blue': {2011: 17, 2012: 27, 2013: 18}}
frame2 = pd.DataFrame(nestdict)
frame2

Unnamed: 0,red,white,blue
2012,22.0,22,27
2013,33.0,16,18
2011,,13,17


In [89]:
#Transposing a dataframe
frame2.T

Unnamed: 0,2012,2013,2011
red,22.0,33.0,
white,22.0,16.0,13.0
blue,27.0,18.0,17.0


In [90]:
#Index Objects
ser = pd.Series([5,0,3,8,4], index=['red','blue','yellow','white','green'])
ser.index
#Index objects are immutable

Index(['red', 'blue', 'yellow', 'white', 'green'], dtype='object')

In [91]:
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [94]:
#methods on index
#idxmin() returns the index with lowest value
#idxmax() returns the index with highest value
print('index value corresponding to lowest value: ',ser.idxmin())
print('index value corresponding to highest value: ',ser.idxmax())

index value corresponding to lowest value:  blue
index value corresponding to highest value:  white


In [95]:
#Index with duplicate labels
serd = pd.Series(range(6), index=['white','white','blue','green',
'green','yellow'])
serd

white     0
white     1
blue      2
green     3
green     4
yellow    5
dtype: int64

In [96]:
serd['white']

white    0
white    1
dtype: int64

In [98]:
print(serd.index.is_unique)
print(frame.index.is_unique)

False
True


In [104]:
#Reindexing
ser = pd.Series([2,5,7,4], index=['one','two','three','four'])
print(ser)
print('\n')
print(ser.reindex(['three','four','five','one']))

one      2
two      5
three    7
four     4
dtype: int64


three    7.0
four     4.0
five     NaN
one      2.0
dtype: float64


In [105]:
ser3=pd.Series([1,5,6,3],index=[0,3,5,6])
ser3

0    1
3    5
5    6
6    3
dtype: int64

In [107]:
# in the above the index is not sequence. For sequence we use method=ffill
ser3.reindex(range(6),method='ffill')

0    1
1    1
2    1
3    5
4    5
5    6
dtype: int64

In [111]:
#Setting a column name to the series column
ser3.name='numbers'
ser3

0    1
3    5
5    6
6    3
Name: numbers, dtype: int64

In [113]:
#reindexing dataframe
frame.reindex(range(5), method='ffill',columns=['colors','price','new',
'object']) 

item,colors,price,new,object
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,blue,1.2,blue,ball
1,green,1.0,green,pen
2,yellow,3.3,yellow,pencil
3,red,0.9,red,paper
4,white,1.7,white,mug


In [114]:
#Dropping
ser = pd.Series(np.arange(4.), index=['red','blue','yellow','white'])
ser

red       0.0
blue      1.0
yellow    2.0
white     3.0
dtype: float64

In [115]:
ser.drop('yellow')

red      0.0
blue     1.0
white    3.0
dtype: float64

In [116]:
ser.drop(['blue','white'])

red       0.0
yellow    2.0
dtype: float64

In [118]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)), index=['red','blue','yellow','white'],
        columns=['ball','pen','pencil','paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [119]:
#To delete rows, you just pass the indexes of the rows.
frame.drop(['blue','yellow'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
white,12,13,14,15


In [121]:
# to delete columns you must specify axis=1
frame.drop(['pen','pencil'],axis=1)

Unnamed: 0,ball,paper
red,0,3
blue,4,7
yellow,8,11
white,12,15


#### Arithmetic and Data Alignment

In [122]:
# Adding two series
s1 = pd.Series([3,2,5,1],['white','yellow','green','blue'])
s2 = pd.Series([1,4,7,2,1],['white','yellow','black','blue','brown'])

In [123]:
s1+s2

black     NaN
blue      3.0
brown     NaN
green     NaN
white     4.0
yellow    6.0
dtype: float64

In [126]:
#Adding two dataframes
frame1 = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],
                      columns=['ball','pen','pencil','paper'])
frame2 = pd.DataFrame(np.arange(12).reshape((4,3)),index=['blue','green','white','yellow'],
                      columns=['mug','pen','ball'])
print(frame1)
print('\n')
print(frame2)
print('\n')
print(frame1+frame2)

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


        mug  pen  ball
blue      0    1     2
green     3    4     5
white     6    7     8
yellow    9   10    11


        ball  mug  paper   pen  pencil
blue     6.0  NaN    NaN   6.0     NaN
green    NaN  NaN    NaN   NaN     NaN
red      NaN  NaN    NaN   NaN     NaN
white   20.0  NaN    NaN  20.0     NaN
yellow  19.0  NaN    NaN  19.0     NaN


#### Flexible arithmetic methods
   #####  add(), sub(), div(), mul()

In [128]:
#Instead of using frame1+frame2. you can use this
frame1.add(frame2)

Unnamed: 0,ball,mug,paper,pen,pencil
blue,6.0,,,6.0,
green,,,,,
red,,,,,
white,20.0,,,20.0,
yellow,19.0,,,19.0,


In [130]:
#Operations between dataframe and series
frame = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [131]:
ser=pd.Series(np.arange(4),index=['ball','pen','pencil','paper'])
ser

ball      0
pen       1
pencil    2
paper     3
dtype: int32

In [132]:
ser1=pd.Series(np.arange(4),index=frame.columns)
ser1

ball      0
pen       1
pencil    2
paper     3
dtype: int32

In [136]:
print(frame)
print('\n')
print(ser)
print('\n')
print(frame-ser)   #Values of the series are subtracted from the dataframe

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


ball      0
pen       1
pencil    2
paper     3
dtype: int32


        ball  pen  pencil  paper
red        0    0       0      0
blue       4    4       4      4
yellow     8    8       8      8
white     12   12      12     12


In [137]:
ser['mug']=9
ser

ball      0
pen       1
pencil    2
paper     3
mug       9
dtype: int32

In [138]:
frame-ser   #mug column is added in series. So dataframe didn't have that column. so it displays NaN value.

Unnamed: 0,ball,mug,paper,pen,pencil
red,0,,0,0,0
blue,4,,4,4,4
yellow,8,,8,8,8
white,12,,12,12,12


#### Function Application and Mapping

In [139]:
#Functions by element
frame = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])
frame

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [140]:
np.sqrt(frame)     #square root function applying to each element

Unnamed: 0,ball,pen,pencil,paper
red,0.0,1.0,1.414214,1.732051
blue,2.0,2.236068,2.44949,2.645751
yellow,2.828427,3.0,3.162278,3.316625
white,3.464102,3.605551,3.741657,3.872983


In [144]:
#Functions by row or column
print(frame)
f=lambda x: x.max()-x.min()
frame.apply(f)   # This is like ball.max()-ball.min(), pen.max()-pen.min(), .....

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


ball      12
pen       12
pencil    12
paper     12
dtype: int64

In [143]:
def f(x):
    return x.min()
frame.apply(f)

ball      0
pen       1
pencil    2
paper     3
dtype: int64

In [145]:
#applying same function along the columns
frame.apply(f,axis=1)   # This is like red.max()-red.min(), blue.max()-blue.min(),....

red       3
blue      3
yellow    3
white     3
dtype: int64

In [146]:
def f(x):
    return pd.Series([x.min(), x.max()], index=['min','max'])
frame.apply(f)

Unnamed: 0,ball,pen,pencil,paper
min,0,1,2,3
max,12,13,14,15


In [152]:
def f1(x):
    return pd.Series([x.mean(),x.max(),x.min(),x.sum()],index=['Avg','max','min','sum'])
print(frame)
print('\n')
print(frame.apply(f1))
print('\n')
print(frame.apply(f1,axis=1))

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


     ball   pen  pencil  paper
Avg   6.0   7.0     8.0    9.0
max  12.0  13.0    14.0   15.0
min   0.0   1.0     2.0    3.0
sum  24.0  28.0    32.0   36.0


         Avg   max   min   sum
red      1.5   3.0   0.0   6.0
blue     5.5   7.0   4.0  22.0
yellow   9.5  11.0   8.0  38.0
white   13.5  15.0  12.0  54.0


#### Statistics functions

In [159]:
print(frame.sum())
print('\n')
print(frame.sum(axis=1))

ball      24
pen       28
pencil    32
paper     36
dtype: int64


red        6
blue      22
yellow    38
white     54
dtype: int64


In [160]:
print(frame.mean())
print('\n')
print(frame.mean(axis=1))

ball      6.0
pen       7.0
pencil    8.0
paper     9.0
dtype: float64


red        1.5
blue       5.5
yellow     9.5
white     13.5
dtype: float64


In [155]:
frame.describe()

Unnamed: 0,ball,pen,pencil,paper
count,4.0,4.0,4.0,4.0
mean,6.0,7.0,8.0,9.0
std,5.163978,5.163978,5.163978,5.163978
min,0.0,1.0,2.0,3.0
25%,3.0,4.0,5.0,6.0
50%,6.0,7.0,8.0,9.0
75%,9.0,10.0,11.0,12.0
max,12.0,13.0,14.0,15.0


#### Sorting and Ranking

In [161]:
ser = pd.Series([5,0,3,8,4],index=['red','blue','yellow','white','green'])
ser

red       5
blue      0
yellow    3
white     8
green     4
dtype: int64

In [165]:
print(ser.sort_index())    # sort index in a ascending order
print('\n')
print(ser.sort_index(ascending=False))   # It will sort index in a descending order

blue      0
green     4
red       5
white     8
yellow    3
dtype: int64


yellow    3
white     8
red       5
green     4
blue      0
dtype: int64


In [167]:
frame = pd.DataFrame(np.arange(16).reshape((4,4)),index=['red','blue','yellow','white'],
                     columns=['ball','pen','pencil','paper'])
print(frame)
print('\n')
print(frame.sort_index())
print('\n')
print(frame.sort_index(axis=1))

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


        ball  pen  pencil  paper
blue       4    5       6      7
red        0    1       2      3
white     12   13      14     15
yellow     8    9      10     11


        ball  paper  pen  pencil
red        0      3    1       2
blue       4      7    5       6
yellow     8     11    9      10
white     12     15   13      14


In [169]:
print(ser)
print('\n')
print(ser.sort_values())


red       5
blue      0
yellow    3
white     8
green     4
dtype: int64


blue      0
yellow    3
green     4
red       5
white     8
dtype: int64


In [171]:
print(frame)
print('\n')
print(frame.sort_values(by='pen'))    #based on pen column the values are sorted

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


In [172]:
 frame.sort_values(by=['pen','pencil'])

Unnamed: 0,ball,pen,pencil,paper
red,0,1,2,3
blue,4,5,6,7
yellow,8,9,10,11
white,12,13,14,15


In [173]:
#rank(). The rank will be assigned starting from the lowest value to the highest.
ser.rank()

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [174]:
ser.rank(method='first')  #rank to the values are given in ascending order

red       4.0
blue      1.0
yellow    2.0
white     5.0
green     3.0
dtype: float64

In [175]:
ser.rank(ascending=False)   #rank is given in descending order.

red       2.0
blue      5.0
yellow    4.0
white     1.0
green     3.0
dtype: float64

#### Correlation and Covariance

In [176]:
#In pandas to calculate covariance and correlation by functions. corr() and cov().
seq2 = pd.Series([3,4,3,4,5,4,3,2],['2006','2007','2008','2009','2010','2011','2012','2013'])
seq = pd.Series([1,2,3,4,4,3,2,1],['2006','2007','2008','2009','2010','2011','2012','2013'])
print('Correlation: ',seq.corr(seq2))
print('Covariance: ',seq.cov(seq2))

Correlation:  0.7745966692414835
Covariance:  0.8571428571428571


In [180]:
frame2 = pd.DataFrame([[1,4,3,6],[4,5,6,1],[3,3,1,5],[4,1,6,4]],index=['red','blue','yellow','white'],
                      columns=['ball','pen','pencil','paper'])
print(frame2)
print('\n')
print(frame2.corr())
print('\n')
print(frame2.cov())

        ball  pen  pencil  paper
red        1    4       3      6
blue       4    5       6      1
yellow     3    3       1      5
white      4    1       6      4


            ball       pen    pencil     paper
ball    1.000000 -0.276026  0.577350 -0.763763
pen    -0.276026  1.000000 -0.079682 -0.361403
pencil  0.577350 -0.079682  1.000000 -0.692935
paper  -0.763763 -0.361403 -0.692935  1.000000


            ball       pen    pencil     paper
ball    2.000000 -0.666667  2.000000 -2.333333
pen    -0.666667  2.916667 -0.333333 -1.333333
pencil  2.000000 -0.333333  6.000000 -3.666667
paper  -2.333333 -1.333333 -3.666667  4.666667


In [185]:
'''Using the corrwith() method, you can calculate the pairwise correlations between
the columns or rows of a dataframe with a series or another DataFrame().'''
ser = pd.Series([0,1,2,3,9],index=['red','blue','yellow','white','green'])
print(ser)
print('\n')
print(frame2.corrwith(ser))      #correlation of one dataframe with series of data
print('\n')
print(frame2.corrwith(frame))

red       0
blue      1
yellow    2
white     3
green     9
dtype: int64


ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64


ball      0.730297
pen      -0.831522
pencil    0.210819
paper    -0.119523
dtype: float64


In [186]:
#Not a Number Data
ser = pd.Series([0,1,2,np.NaN,9],index=['red','blue','yellow','white','green'])
print(ser)
print('\n')
ser['white']=None
print(ser)

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64


red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64


In [188]:
#Filtering NaN values
print(ser)
print('\n')
print(ser.dropna())

red       0.0
blue      1.0
yellow    2.0
white     NaN
green     9.0
dtype: float64


red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64


In [189]:
ser[ser.notnull()]

red       0.0
blue      1.0
yellow    2.0
green     9.0
dtype: float64

In [195]:
frame3 = pd.DataFrame([[6,np.nan,6],[np.nan,np.nan,np.nan],[2,np.nan,5]],index = ['blue','green','red'],
                      columns = ['ball','mug','pen'])
frame3

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
green,,,
red,2.0,,5.0


In [191]:
frame3.dropna() #it will delete entire row if one null value present

Unnamed: 0,ball,mug,pen


In [192]:
frame3.dropna(how='all')  #By specifying how =all. it deletes the row containing full null values

Unnamed: 0,ball,mug,pen
blue,6.0,,6.0
red,2.0,,5.0


In [197]:
#Filling in NaN values
frame3.fillna(0)

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,0.0,0.0,0.0
red,2.0,0.0,5.0


In [198]:
frame3.fillna({'ball':1,'mug':0,'pen':99})

Unnamed: 0,ball,mug,pen
blue,6.0,0.0,6.0
green,1.0,0.0,99.0
red,2.0,0.0,5.0


#### Hierarchical index and Leveling

In [200]:
mser = pd.Series(np.random.rand(8),index=[['white','white','white','blue','blue','red','red','red'],
                                          ['up','down','right','up','down','up','down','left']])
mser

white  up       0.602807
       down     0.795326
       right    0.247240
blue   up       0.440312
       down     0.551458
red    up       0.601568
       down     0.288714
       left     0.994708
dtype: float64

In [201]:
mser.index

MultiIndex([('white',    'up'),
            ('white',  'down'),
            ('white', 'right'),
            ( 'blue',    'up'),
            ( 'blue',  'down'),
            (  'red',    'up'),
            (  'red',  'down'),
            (  'red',  'left')],
           )

In [204]:
# selecting the values for a given value of the first index
mser['white']

up       0.602807
down     0.795326
right    0.247240
dtype: float64

In [205]:
# selecting values for a given value of the second index.
mser[:,'up']

white    0.602807
blue     0.440312
red      0.601568
dtype: float64

In [214]:
# if you want to select a specific value, you specify both indexes.
mser['white','up']

0.6028065121913092

In [215]:
"""Hierarchical indexing plays a critical role in reshaping data and group-based
operations such as a pivot-table. For example, the data could be rearranged and used
in a dataframe with a special function called unstack(). This function converts the
series with a hierarchical index to a simple dataframe, where the second set of indexes is
converted into a new set of columns. """

mser.unstack()

Unnamed: 0,down,left,right,up
blue,0.551458,,,0.440312
red,0.288714,0.994708,,0.601568
white,0.795326,,0.24724,0.602807


In [217]:
#performing the reverse operation, used to convert a dataframe to a series, you use the stack() function.
print(frame)
print('\n')
print(frame.stack())

        ball  pen  pencil  paper
red        0    1       2      3
blue       4    5       6      7
yellow     8    9      10     11
white     12   13      14     15


red     ball       0
        pen        1
        pencil     2
        paper      3
blue    ball       4
        pen        5
        pencil     6
        paper      7
yellow  ball       8
        pen        9
        pencil    10
        paper     11
white   ball      12
        pen       13
        pencil    14
        paper     15
dtype: int32


In [218]:
mframe = pd.DataFrame(np.random.randn(16).reshape(4,4),index=[['white','white','red','red'], ['up','down','up','down']],
                      columns=[['pen','pen','paper','paper'],[1,2,1,2]])
mframe

Unnamed: 0_level_0,Unnamed: 1_level_0,pen,pen,paper,paper
Unnamed: 0_level_1,Unnamed: 1_level_1,1,2,1,2
white,up,-1.03525,0.200562,-0.826292,-0.914529
white,down,0.321935,-0.461226,0.025973,1.511283
red,up,0.434769,-0.608726,-0.518098,0.692281
red,down,1.03127,-0.360108,-1.776662,0.936875


#### Reordering and Sorting Levels

In [219]:
mframe.columns.names = ['objects','id']
mframe.index.names = ['colors','status']

In [220]:
mframe

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
colors,status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
white,up,-1.03525,0.200562,-0.826292,-0.914529
white,down,0.321935,-0.461226,0.025973,1.511283
red,up,0.434769,-0.608726,-0.518098,0.692281
red,down,1.03127,-0.360108,-1.776662,0.936875


In [221]:
mframe.swaplevel('colors','status')

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
status,colors,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
up,white,-1.03525,0.200562,-0.826292,-0.914529
down,white,0.321935,-0.461226,0.025973,1.511283
up,red,0.434769,-0.608726,-0.518098,0.692281
down,red,1.03127,-0.360108,-1.776662,0.936875


In [222]:
mframe.sort_index(level='colors') 

Unnamed: 0_level_0,objects,pen,pen,paper,paper
Unnamed: 0_level_1,id,1,2,1,2
colors,status,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
red,down,1.03127,-0.360108,-1.776662,0.936875
red,up,0.434769,-0.608726,-0.518098,0.692281
white,down,0.321935,-0.461226,0.025973,1.511283
white,up,-1.03525,0.200562,-0.826292,-0.914529
