Pandas - a library built on top of NumPy that provides labeled data structures (Series and DataFrame) for data exploration and manipulation

Series is a one-dimensional labeled array capable of holding data of any type (integer, string, float, python objects, etc.). The axis labels are collectively called index.

In [3]:
# Create the Python objects used below: a list of labels, a list of values, a NumPy array and a dictionary

import numpy as np
import pandas as pd

# Index labels and the values that go with them, as plain Python lists
label = 'a b c'.split()
my_data = list(range(10, 40, 10))

# The same values wrapped in a NumPy array
arr = np.array(my_data)

# Label -> value mapping built by pairing the two lists
d = dict(zip(label, my_data))

In [4]:
my_data

[10, 20, 30]

In [5]:
# To create a Series
# General form ~ pd.Series(data=, index=, dtype=, name=, copy=)
# (fastpath is an internal argument and is not meant to be passed by users)
    
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [6]:
# A series takes an index and we can specify the index

pd.Series(data=my_data, index=label) 

a    10
b    20
c    30
dtype: int64

In [7]:
# Since data and index are the first two parameters of pd.Series, the keywords can be dropped as long as the arguments are passed in that order

pd.Series(my_data,label)

a    10
b    20
c    30
dtype: int64

In [8]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int32

In [9]:
# For a dictionary the key is the label and the value is the data

pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [10]:
pd.Series(label)

0    a
1    b
2    c
dtype: object

In [11]:
Ser1 = pd.Series([1,2,3,4],['USA','Germany','USSR','Japan'])

In [12]:
Ser2 = pd.Series([1,2,5,4],['USA','Germany','Italy','Japan'])

In [13]:
Ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [14]:
Ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [19]:
# We can grab information from the Series using the index

Ser2['Italy']

5

In [15]:
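# Arithmetic between Series aligns on the index labels. Labels present in only one Series
# (Italy, USSR) have no partner, so the result is NaN there; NaN is a float, so pandas
# converts the integer values to float64.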
 
Ser1 + Ser2

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

In [17]:
Ser3 = pd.Series(label)

In [18]:
# Strings are stored with the generic object dtype in a pandas Series

Ser3

0    a
1    b
2    c
dtype: object

DataFrames

A DataFrame is a collection of pandas Series (one per column) that share a common index

In [34]:
# To create a dataframe

# General Form ~ pd.DataFrame(data=, index=, columns=, dtype=, copy=)

import numpy as np
import pandas as pd

from numpy.random import randn

In [35]:
# Seeding NumPy's global random generator makes randn produce the same numbers on every run

np.random.seed(101)

In [36]:
# 5x4 frame of standard-normal draws: rows labelled A-E, columns labelled W-Z
df = pd.DataFrame(data=randn(5, 4), index=list('ABCDE'), columns=list('WXYZ'))

In [37]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
# A DataFrame is a collection of pandas Series (its columns) that share a common index

# To grab/select a single column from a DataFrame as a Series
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [39]:
type(df['W'])

pandas.core.series.Series

In [40]:
type(df)

pandas.core.frame.DataFrame

In [42]:
# A column can also be accessed as an attribute (df.W). This is not recommended: a column whose name
# matches an existing DataFrame attribute or method (e.g. 'count', 'shape') is hidden by it, and a
# reader cannot tell whether df.something is a column or a method.

df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [43]:
# To select multiple columns

df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [44]:
# To add a new column to a DataFrame

df['new'] = df['W'] + df['Z']

In [45]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


In [46]:
# To drop a column we have to pass axis=1. Without it drop() uses axis=0, looks for the label 'new' in the
# row index and raises an error (KeyError in current pandas; older versions raised ValueError)

df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [47]:
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.210676
B,0.651118,-0.319318,-0.848077,0.605965,1.257083
C,-2.018168,0.740122,0.528813,-0.589001,-2.607169
D,0.188695,-0.758872,-0.933237,0.955057,1.143752
E,0.190794,1.978757,2.605967,0.683509,0.874303


In [48]:
# drop() returns a new DataFrame, so the 'new' column is still present in df. To remove it from df itself we
# need inplace=True (or reassign: df = df.drop('new', axis=1)). This default protects us from losing data accidentally.

df.drop('new',axis=1,inplace=True)

In [100]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [51]:
# Dropping a row: this can also be written as df.drop('E', axis=0); axis=0 (rows) is the default

df.drop('E')

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [53]:
# shape is a tuple (rows, columns): 5 rows along axis 0 and 4 columns along axis 1
df.shape

(5, 4)

In [98]:
# To select rows in a DataFrame
# we use the .loc (label-based) or .iloc (integer-position-based) indexers; a selected row is also returned as a Series

df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [99]:
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [59]:
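# Selecting by row label and column label with .loc: a single value here, a subset via lists of labels in the next cell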

df.loc['B','Y']

-0.84807698340363147

In [60]:
df.loc[['A','D'],['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
D,0.188695,0.955057


Conditional Selection

In [61]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [62]:
bool_df = df>0

In [63]:
bool_df

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [64]:
# Values greater than 0 are kept; every other value (zero or negative) is replaced with NaN

df[df>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [70]:
df['W']>0 

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [74]:
# When a boolean Series built from one column is used to index the DataFrame, only the rows where the
# condition is True are returned. Row C, where W is not greater than 0, is dropped entirely (no NaNs).

df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [75]:
# The conditional selection returns a subset DataFrame that we can keep working on, either in two steps (saving it in another variable first) or in a single step

resultdf= df[df['W']>0]

In [76]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [77]:
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [79]:
# In a single step: pass the row mask and the column label to .loc together.
# This returns the same Series as df[df['W']>0]['X'] but avoids chained indexing,
# which builds an intermediate DataFrame and leads to SettingWithCopyWarning
# if the same pattern is ever used for assignment.

df.loc[df['W']>0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [81]:
# A list of column labels can be requested as well; the result is then a DataFrame.
# As above, the row mask and the columns go into a single .loc call instead of
# chaining df[mask][[...]].

df.loc[df['W']>0, ['X','Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


Multiple conditional statements

In [85]:
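# We have to use & rather than 'and': Python's 'and' needs a single True/False value from each operand,
# but each comparison here produces a whole Series of booleans (its truth value is ambiguous -> ValueError).
# & combines the two Series element-wise. Each condition must be wrapped in parentheses.
# AND = & and OR = |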

df[(df['X']<1) & (df['Y']<0)]

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


Resetting the Index

In [86]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [87]:
# reset_index() moves the current index into a column named 'index' and replaces it with the default 0..n-1 integers
# It returns a new DataFrame; inplace=True would modify the original instead
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [88]:
# We can also set an index: first build the list of labels that will be used as the new index

newind = 'CA OH PA AZ NY'.split()

In [89]:
newind

['CA', 'OH', 'PA', 'AZ', 'NY']

In [93]:
# create a new column states in the dataframe

df['States'] = newind

In [94]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,OH
C,-2.018168,0.740122,0.528813,-0.589001,PA
D,0.188695,-0.758872,-0.933237,0.955057,AZ
E,0.190794,1.978757,2.605967,0.683509,NY


In [95]:
# To set the States column as the index (returns a new DataFrame; df itself is unchanged unless inplace=True is passed)

df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
OH,0.651118,-0.319318,-0.848077,0.605965
PA,-2.018168,0.740122,0.528813,-0.589001
AZ,0.188695,-0.758872,-0.933237,0.955057
NY,0.190794,1.978757,2.605967,0.683509


Multi-index and index hierarchy

In [106]:
# ZIP is used to pack the variables into tuples

Out= ['G1','G1','G1','G2','G2','G2']
In=[1,2,3,1,2,3]
hierarchy_index = list(zip(Out,In))
hierarchy_index = pd.MultiIndex.from_tuples(hierarchy_index)

In [108]:
hierarchy_index = list(zip(Out,In))

In [109]:
hierarchy_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [110]:
hierarchy_index = pd.MultiIndex.from_tuples(hierarchy_index)

In [111]:
hierarchy_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [112]:
df = pd.DataFrame(randn(6,2),hierarchy_index,['A','B'])

In [113]:
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [114]:
df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [115]:
# Address the row with its full (group, segment) key in a single .loc call
# instead of chaining two .loc lookups; the value returned is the same.
df.loc[('G1', 2), 'B']

-1.1591194155484297

Naming the index levels

In [116]:
df.index.names

FrozenList([None, None])

In [117]:
df.index.names = ['Groups','Segments']

In [118]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Segments,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


Cross Section

In [119]:
# xs (cross section) selects rows by index label and also works on a MultiIndex

df.xs('G1')

Unnamed: 0_level_0,A,B
Segments,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [120]:
# To select Segment 1 from every group, pass the label together with level='Segments' to xs

df.xs(1,level='Segments')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.302665,1.693723
G2,0.166905,0.184502
