# DataFrames

The DataFrame concept in Python (pandas) is similar to the data frame in the R programming language. A DataFrame is a collection of Series objects combined so that they all share the same index.

In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn
# Seed NumPy's global random generator so the random draws below are repeatable
np.random.seed(1)

In [5]:
# Build a 10x5 frame of random values with row labels A through J;
# the columns keep their default integer labels (0-4).
dataframe = pd.DataFrame(data=randn(10, 5), index=list('ABCDEFGHIJ'))
dataframe


Unnamed: 0,0,1,2,3,4
A,-0.447129,1.224508,0.403492,0.593579,-1.094912
B,0.169382,0.740556,-0.953701,-0.266219,0.032615
C,-1.373117,0.315159,0.846161,-0.859516,0.350546
D,-1.312283,-0.038696,-1.615772,1.121418,0.408901
E,-0.024617,-0.775162,1.273756,1.967102,-1.857982
F,1.236164,1.627651,0.338012,-1.199268,0.863345
G,-0.18092,-0.603921,-1.230058,0.550537,0.792807
H,-0.623531,0.520576,-1.144341,0.801861,0.046567
I,-0.18657,-0.101746,0.868886,0.750412,0.529465
J,0.137701,0.077821,0.61838,0.232495,0.682551


In [51]:
# Recreate the frame, this time also naming the columns Score1..Score5.
dataframe = pd.DataFrame(
    data=randn(10, 5),
    index=list('ABCDEFGHIJ'),
    columns=['Score1', 'Score2', 'Score3', 'Score4', 'Score5'],
)

In [52]:
# Display the frame; the columns now carry the Score1..Score5 labels
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,-0.447129,1.224508,0.403492,0.593579,-1.094912
B,0.169382,0.740556,-0.953701,-0.266219,0.032615
C,-1.373117,0.315159,0.846161,-0.859516,0.350546
D,-1.312283,-0.038696,-1.615772,1.121418,0.408901
E,-0.024617,-0.775162,1.273756,1.967102,-1.857982
F,1.236164,1.627651,0.338012,-1.199268,0.863345
G,-0.18092,-0.603921,-1.230058,0.550537,0.792807
H,-0.623531,0.520576,-1.144341,0.801861,0.046567
I,-0.18657,-0.101746,0.868886,0.750412,0.529465
J,0.137701,0.077821,0.61838,0.232495,0.682551


## Selection and Indexing

Ways to select data from a DataFrame:

In [6]:
# A single column label inside brackets returns that column as a Series
dataframe['Score3']

A   -1.142518
B    0.931102
C    0.512930
D    1.519817
E    0.160037
F    0.827975
G    0.186561
H    0.377564
I   -0.375285
J    0.043597
Name: Score3, dtype: float64

In [7]:
# Pass a list of column names to get a sub-frame; the columns come back
# in the order given in the list (here Score2 before Score1)
dataframe[['Score2','Score1']]

Unnamed: 0,Score2,Score1
A,-0.35225,0.30017
B,0.838983,0.586623
C,1.252868,-0.754398
D,1.131629,-0.075572
E,-0.504466,-1.444114
F,-0.306204,-2.022201
G,-0.200758,-0.222328
H,-0.670662,0.119009
I,0.185156,1.198918
J,-0.343854,0.07734


Each DataFrame column is simply a Series.

In [8]:
# Confirm that a single selected column is a pandas Series
type(dataframe['Score1'])

pandas.core.series.Series

**Adding a new column to the DataFrame**

In [9]:
# Assigning to a column label that does not exist yet creates that column;
# here it holds the row-wise sum of Score1 and Score2
dataframe['Score6'] = dataframe['Score1'] + dataframe['Score2']

In [10]:
# Display the frame; Score6 now appears as the last column
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5,Score6
A,0.30017,-0.35225,-1.142518,-0.349343,-0.208894,-0.05208
B,0.586623,0.838983,0.931102,0.285587,0.885141,1.425607
C,-0.754398,1.252868,0.51293,-0.298093,0.488518,0.49847
D,-0.075572,1.131629,1.519817,2.185575,-1.396496,1.056058
E,-1.444114,-0.504466,0.160037,0.876169,0.315635,-1.94858
F,-2.022201,-0.306204,0.827975,0.230095,0.762011,-2.328405
G,-0.222328,-0.200758,0.186561,0.410052,0.1983,-0.423086
H,0.119009,-0.670662,0.377564,0.121821,1.129484,-0.551654
I,1.198918,0.185156,-0.375285,-0.63873,0.423494,1.384074
J,0.07734,-0.343854,0.043597,-0.620001,0.698032,-0.266514


**Removing columns from the DataFrame**

In [15]:
# Use axis=0 for dropping rows and axis=1 for dropping columns.
# Without inplace=True the result is a new frame; `dataframe` itself is not modified.
dataframe.drop('Score6',axis=1)

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,0.30017,-0.35225,-1.142518,-0.349343,-0.208894
B,0.586623,0.838983,0.931102,0.285587,0.885141
C,-0.754398,1.252868,0.51293,-0.298093,0.488518
D,-0.075572,1.131629,1.519817,2.185575,-1.396496
E,-1.444114,-0.504466,0.160037,0.876169,0.315635
F,-2.022201,-0.306204,0.827975,0.230095,0.762011
G,-0.222328,-0.200758,0.186561,0.410052,0.1983
H,0.119009,-0.670662,0.377564,0.121821,1.129484
I,1.198918,0.185156,-0.375285,-0.63873,0.423494
J,0.07734,-0.343854,0.043597,-0.620001,0.698032


In [16]:
# The column is still present: drop() only modifies `dataframe` itself
# when inplace=True is passed
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5,Score6
A,0.30017,-0.35225,-1.142518,-0.349343,-0.208894,-0.05208
B,0.586623,0.838983,0.931102,0.285587,0.885141,1.425607
C,-0.754398,1.252868,0.51293,-0.298093,0.488518,0.49847
D,-0.075572,1.131629,1.519817,2.185575,-1.396496,1.056058
E,-1.444114,-0.504466,0.160037,0.876169,0.315635,-1.94858
F,-2.022201,-0.306204,0.827975,0.230095,0.762011,-2.328405
G,-0.222328,-0.200758,0.186561,0.410052,0.1983,-0.423086
H,0.119009,-0.670662,0.377564,0.121821,1.129484,-0.551654
I,1.198918,0.185156,-0.375285,-0.63873,0.423494,1.384074
J,0.07734,-0.343854,0.043597,-0.620001,0.698032,-0.266514


In [17]:
# With inplace=True the column is removed from `dataframe` itself,
# so this cell shows no output
dataframe.drop('Score6',axis=1,inplace=True)

In [21]:
# Display the frame; Score6 is gone because the drop was applied in place
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,0.30017,-0.35225,-1.142518,-0.349343,-0.208894
B,0.586623,0.838983,0.931102,0.285587,0.885141
C,-0.754398,1.252868,0.51293,-0.298093,0.488518
D,-0.075572,1.131629,1.519817,2.185575,-1.396496
E,-1.444114,-0.504466,0.160037,0.876169,0.315635
F,-2.022201,-0.306204,0.827975,0.230095,0.762011
G,-0.222328,-0.200758,0.186561,0.410052,0.1983
H,0.119009,-0.670662,0.377564,0.121821,1.129484
I,1.198918,0.185156,-0.375285,-0.63873,0.423494
J,0.07734,-0.343854,0.043597,-0.620001,0.698032


Dropping rows using `axis=0`:

In [22]:
# As with columns, the row is removed from `dataframe` itself only when
# inplace=True is passed; otherwise a new frame without row 'A' is returned
dataframe.drop('A',axis=0)

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
B,0.586623,0.838983,0.931102,0.285587,0.885141
C,-0.754398,1.252868,0.51293,-0.298093,0.488518
D,-0.075572,1.131629,1.519817,2.185575,-1.396496
E,-1.444114,-0.504466,0.160037,0.876169,0.315635
F,-2.022201,-0.306204,0.827975,0.230095,0.762011
G,-0.222328,-0.200758,0.186561,0.410052,0.1983
H,0.119009,-0.670662,0.377564,0.121821,1.129484
I,1.198918,0.185156,-0.375285,-0.63873,0.423494
J,0.07734,-0.343854,0.043597,-0.620001,0.698032


**Selecting rows**

In [26]:
# Label-based row lookup; the row comes back as a Series named 'F'
dataframe.loc['F']

Score1   -2.022201
Score2   -0.306204
Score3    0.827975
Score4    0.230095
Score5    0.762011
Name: F, dtype: float64

**Or select by integer position instead of label: use `iloc` instead of `loc`**

In [24]:
# Position-based row lookup: position 2 is the third row (label 'C')
dataframe.iloc[2]

Score1   -0.754398
Score2    1.252868
Score3    0.512930
Score4   -0.298093
Score5    0.488518
Name: C, dtype: float64

**Selecting a subset of rows and columns using `loc`**

In [27]:
# A row label plus a column label selects a single scalar value
dataframe.loc['A','Score1']

0.30017031995582749

In [28]:
# A list of row labels plus a list of column labels returns the matching sub-frame
dataframe.loc[['A','B'],['Score1','Score2']]

Unnamed: 0,Score1,Score2
A,0.30017,-0.35225
B,0.586623,0.838983


### Conditional Selection

Similar to NumPy, we can make conditional selections using bracket notation.

In [29]:
# Show the frame again as a reference for the comparisons below
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,0.30017,-0.35225,-1.142518,-0.349343,-0.208894
B,0.586623,0.838983,0.931102,0.285587,0.885141
C,-0.754398,1.252868,0.51293,-0.298093,0.488518
D,-0.075572,1.131629,1.519817,2.185575,-1.396496
E,-1.444114,-0.504466,0.160037,0.876169,0.315635
F,-2.022201,-0.306204,0.827975,0.230095,0.762011
G,-0.222328,-0.200758,0.186561,0.410052,0.1983
H,0.119009,-0.670662,0.377564,0.121821,1.129484
I,1.198918,0.185156,-0.375285,-0.63873,0.423494
J,0.07734,-0.343854,0.043597,-0.620001,0.698032


In [30]:
# Element-wise comparison: yields a frame of booleans with the same rows and columns
dataframe>0.5

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,False,False,False,False,False
B,True,True,True,False,True
C,False,True,True,False,False
D,False,True,True,True,False
E,False,False,False,True,False
F,False,False,True,False,True
G,False,False,False,False,False
H,False,False,False,False,True
I,True,False,False,False,False
J,False,False,False,False,True


In [31]:
# Indexing with the boolean frame keeps the values where the mask is True;
# every other cell shows up as a missing value
dataframe[dataframe>0.5]

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,,,,,
B,0.586623,0.838983,0.931102,,0.885141
C,,1.252868,0.51293,,
D,,1.131629,1.519817,2.185575,
E,,,,0.876169,
F,,,0.827975,,0.762011
G,,,,,
H,,,,,1.129484
I,1.198918,,,,
J,,,,,0.698032


In [32]:
# A boolean Series built from one column keeps only the rows where it is True
dataframe[dataframe['Score1']>0.5]

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
B,0.586623,0.838983,0.931102,0.285587,0.885141
I,1.198918,0.185156,-0.375285,-0.63873,0.423494


In [33]:
# Filter the rows and pick the column in a single .loc call rather than chaining
# two bracket lookups; the result is the Score2 Series for rows with Score1 > 0.5.
dataframe.loc[dataframe['Score1']>0.5, 'Score2']

B    0.838983
I    0.185156
Name: Score2, dtype: float64

In [34]:
# Filter the rows and pick several columns in a single .loc call rather than
# chaining two bracket lookups; returns Score2 and Score3 for rows with Score1 > 0.5.
dataframe.loc[dataframe['Score1']>0.5, ['Score2','Score3']]

Unnamed: 0,Score2,Score3
B,0.838983,0.931102
I,0.185156,-0.375285


For multiple conditions, combine them with `|` (or) and `&` (and), wrapping each condition in parentheses.

In [37]:
# Each comparison is parenthesised before being combined with &, because
# Python's & binds more tightly than the comparison operators
dataframe[(dataframe['Score1']>0.5) & (dataframe['Score2'] > 0)]

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
B,0.586623,0.838983,0.931102,0.285587,0.885141
I,1.198918,0.185156,-0.375285,-0.63873,0.423494


## More Index Details

Some more features of indexing include:
  - resetting the index 
  - setting a different value
  - index hierarchy

In [39]:
# Show the frame again before changing its index
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5
A,0.30017,-0.35225,-1.142518,-0.349343,-0.208894
B,0.586623,0.838983,0.931102,0.285587,0.885141
C,-0.754398,1.252868,0.51293,-0.298093,0.488518
D,-0.075572,1.131629,1.519817,2.185575,-1.396496
E,-1.444114,-0.504466,0.160037,0.876169,0.315635
F,-2.022201,-0.306204,0.827975,0.230095,0.762011
G,-0.222328,-0.200758,0.186561,0.410052,0.1983
H,0.119009,-0.670662,0.377564,0.121821,1.129484
I,1.198918,0.185156,-0.375285,-0.63873,0.423494
J,0.07734,-0.343854,0.043597,-0.620001,0.698032


In [53]:
# Reset to the default integer index (0, 1, 2, ...) instead of A to J; the old
# labels are kept in a new column named 'index'. A new frame is returned and
# `dataframe` itself is not modified.
dataframe.reset_index()

Unnamed: 0,index,Score1,Score2,Score3,Score4,Score5
0,A,-0.447129,1.224508,0.403492,0.593579,-1.094912
1,B,0.169382,0.740556,-0.953701,-0.266219,0.032615
2,C,-1.373117,0.315159,0.846161,-0.859516,0.350546
3,D,-1.312283,-0.038696,-1.615772,1.121418,0.408901
4,E,-0.024617,-0.775162,1.273756,1.967102,-1.857982
5,F,1.236164,1.627651,0.338012,-1.199268,0.863345
6,G,-0.18092,-0.603921,-1.230058,0.550537,0.792807
7,H,-0.623531,0.520576,-1.144341,0.801861,0.046567
8,I,-0.18657,-0.101746,0.868886,0.750412,0.529465
9,J,0.137701,0.077821,0.61838,0.232495,0.682551


In [54]:
# Labels that will become the new index, one per existing row
newindex = ['IND', 'JP', 'CAN', 'GE', 'IT', 'PL', 'FY', 'IU', 'RT', 'IP']

In [55]:
# Attach the labels to the frame as a regular column named 'Countries'
dataframe['Countries'] = newindex

In [56]:
# Display the frame; Countries now appears as the last column
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5,Countries
A,-0.447129,1.224508,0.403492,0.593579,-1.094912,IND
B,0.169382,0.740556,-0.953701,-0.266219,0.032615,JP
C,-1.373117,0.315159,0.846161,-0.859516,0.350546,CAN
D,-1.312283,-0.038696,-1.615772,1.121418,0.408901,GE
E,-0.024617,-0.775162,1.273756,1.967102,-1.857982,IT
F,1.236164,1.627651,0.338012,-1.199268,0.863345,PL
G,-0.18092,-0.603921,-1.230058,0.550537,0.792807,FY
H,-0.623531,0.520576,-1.144341,0.801861,0.046567,IU
I,-0.18657,-0.101746,0.868886,0.750412,0.529465,RT
J,0.137701,0.077821,0.61838,0.232495,0.682551,IP


In [57]:
# Use the Countries column as the index; this returns a new frame and
# leaves `dataframe` itself unchanged
dataframe.set_index('Countries')

Unnamed: 0_level_0,Score1,Score2,Score3,Score4,Score5
Countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
IND,-0.447129,1.224508,0.403492,0.593579,-1.094912
JP,0.169382,0.740556,-0.953701,-0.266219,0.032615
CAN,-1.373117,0.315159,0.846161,-0.859516,0.350546
GE,-1.312283,-0.038696,-1.615772,1.121418,0.408901
IT,-0.024617,-0.775162,1.273756,1.967102,-1.857982
PL,1.236164,1.627651,0.338012,-1.199268,0.863345
FY,-0.18092,-0.603921,-1.230058,0.550537,0.792807
IU,-0.623531,0.520576,-1.144341,0.801861,0.046567
RT,-0.18657,-0.101746,0.868886,0.750412,0.529465
IP,0.137701,0.077821,0.61838,0.232495,0.682551


In [59]:
# Still unchanged: once again, set_index only modifies `dataframe` itself
# when inplace=True is passed
dataframe

Unnamed: 0,Score1,Score2,Score3,Score4,Score5,Countries
A,-0.447129,1.224508,0.403492,0.593579,-1.094912,IND
B,0.169382,0.740556,-0.953701,-0.266219,0.032615,JP
C,-1.373117,0.315159,0.846161,-0.859516,0.350546,CAN
D,-1.312283,-0.038696,-1.615772,1.121418,0.408901,GE
E,-0.024617,-0.775162,1.273756,1.967102,-1.857982,IT
F,1.236164,1.627651,0.338012,-1.199268,0.863345,PL
G,-0.18092,-0.603921,-1.230058,0.550537,0.792807,FY
H,-0.623531,0.520576,-1.144341,0.801861,0.046567,IU
I,-0.18657,-0.101746,0.868886,0.750412,0.529465,RT
J,0.137701,0.077821,0.61838,0.232495,0.682551,IP


In [60]:
# With inplace=True the index change is applied to `dataframe` itself,
# so this cell shows no output
dataframe.set_index('Countries',inplace=True)

In [61]:
# Display the frame; Countries now serves as the index
dataframe

Unnamed: 0_level_0,Score1,Score2,Score3,Score4,Score5
Countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
IND,-0.447129,1.224508,0.403492,0.593579,-1.094912
JP,0.169382,0.740556,-0.953701,-0.266219,0.032615
CAN,-1.373117,0.315159,0.846161,-0.859516,0.350546
GE,-1.312283,-0.038696,-1.615772,1.121418,0.408901
IT,-0.024617,-0.775162,1.273756,1.967102,-1.857982
PL,1.236164,1.627651,0.338012,-1.199268,0.863345
FY,-0.18092,-0.603921,-1.230058,0.550537,0.792807
IU,-0.623531,0.520576,-1.144341,0.801861,0.046567
RT,-0.18657,-0.101746,0.868886,0.750412,0.529465
IP,0.137701,0.077821,0.61838,0.232495,0.682551


### The END