# DataFrames

DataFrames are a bunch of Series objects put together to share the same index.

In [1]:
import pandas as pd
import numpy as np

# Create DataFrame
A pandas DataFrame can be created using various inputs like −

Lists
dict
Series
Numpy ndarrays
Another DataFrame


## Create a DataFrame from Lists

The DataFrame can be created using a single list or a list of lists.

In [2]:
a = [10,20,30,40,50]
B = pd.DataFrame(a)
print(B)
# B[0]

    0
0  10
1  20
2  30
3  40
4  50


In [7]:
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data, columns=['Name','Age'])
# print(df)
df['Age']

0    10
1    12
2    13
Name: Age, dtype: int64

In [8]:
data = [['Alex',10],['Bob',12],['Clarke',13]]
df = pd.DataFrame(data,columns=['Name','Age'],dtype=float)
print(df)

     Name   Age
0    Alex  10.0
1     Bob  12.0
2  Clarke  13.0


## Create a DataFrame from Dict of Lists/ndarrays

All the ndarrays must be of same length. 
If index is passed, then the length of the index should equal to the length of the arrays.

In [9]:
a = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
# print(type(a))
df = pd.DataFrame(a)
print(df)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


In [10]:
a = {'Name':np.array(['Tom', 'Jack', 'Steve', 'Ricky']),'Age':np.array([28,34,29,42])}
df = pd.DataFrame(a)
print(df)

    Name  Age
0    Tom   28
1   Jack   34
2  Steve   29
3  Ricky   42


In [11]:
a = {'Name':['Tom', 'Jack', 'Steve', 'Ricky'],'Age':[28,34,29,42]}
df = pd.DataFrame(a, index=['rank1','rank2','rank3','rank4'])
print(df)

        Name  Age
rank1    Tom   28
rank2   Jack   34
rank3  Steve   29
rank4  Ricky   42


In [15]:
df = pd.DataFrame(np.random.randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())
# 'A B C D E'.split()
df

Unnamed: 0,W,X,Y,Z
A,-0.87768,1.239049,0.618773,0.622085
B,0.04604,-0.800241,0.953133,0.591053
C,-0.221279,-0.704658,-0.053478,-0.011924
D,-0.355951,-0.184137,0.060732,0.396691
E,0.66877,-0.793161,-0.379663,2.300234


In [13]:
# df
# np.random.randn(5,4)
'A B C D E'.split()

['A', 'B', 'C', 'D', 'E']

## Create a DataFrame from Dict of Series

In [16]:
d = {'one' : pd.Series([1, 2, 3], index=['a', 'b', 'c']),
   'two' : pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


## Selection and Indexing

Let's learn the various methods to grab data from a DataFrame

In [26]:
df = pd.DataFrame(np.random.randn(5,4),index='A B C D E'.split(),columns='W X Y Z'.split())
df

Unnamed: 0,W,X,Y,Z
A,-0.023082,1.056658,0.328288,-0.256404
B,-0.445849,0.464248,-1.303961,-1.730333
C,0.201453,-0.559383,-0.363483,-0.146483
D,1.882406,1.114555,-0.334895,1.125398
E,-1.153043,1.907769,0.165622,-0.091237


In [19]:
df['W']

A    2.213157
B   -0.266284
C    0.364260
D   -0.893653
E    1.911165
Name: W, dtype: float64

In [18]:
df['W']['A']

2.213157155411523

In [24]:
# Pass a list of column names
df[['W','Z','X']]

Unnamed: 0,W,Z,X
A,2.213157,0.219666,-0.959896
B,-0.266284,-0.238207,-0.350023
C,0.36426,-0.353315,-1.16636
D,-0.893653,-1.151851,2.738225
E,1.911165,1.505695,0.394789


DataFrame Columns are just Series

In [27]:
type(df['W'])

pandas.core.series.Series

**Creating a new column:**

In [31]:
# df['new'] = [1,2,3,4,5]
df['sumWY'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new,sumWY
A,-0.023082,1.056658,0.328288,-0.256404,1,0.305205
B,-0.445849,0.464248,-1.303961,-1.730333,2,-1.74981
C,0.201453,-0.559383,-0.363483,-0.146483,3,-0.16203
D,1.882406,1.114555,-0.334895,1.125398,4,1.547511
E,-1.153043,1.907769,0.165622,-0.091237,5,-0.987421


In [32]:
df

Unnamed: 0,W,X,Y,Z,new,sumWY
A,-0.023082,1.056658,0.328288,-0.256404,1,0.305205
B,-0.445849,0.464248,-1.303961,-1.730333,2,-1.74981
C,0.201453,-0.559383,-0.363483,-0.146483,3,-0.16203
D,1.882406,1.114555,-0.334895,1.125398,4,1.547511
E,-1.153043,1.907769,0.165622,-0.091237,5,-0.987421


In [33]:
type(df['new'])

pandas.core.series.Series

** Removing Columns**

In [35]:
df.drop('A', axis=0)
#df.drop('new')
# df.drop('A')
# df.drop('W',axis=1)

Unnamed: 0,W,X,Y,Z,new,sumWY
B,-0.445849,0.464248,-1.303961,-1.730333,2,-1.74981
C,0.201453,-0.559383,-0.363483,-0.146483,3,-0.16203
D,1.882406,1.114555,-0.334895,1.125398,4,1.547511
E,-1.153043,1.907769,0.165622,-0.091237,5,-0.987421


In [None]:
# Not inplace unless specified!
df

In [39]:
df.drop('new',axis=1,inplace=True)


In [40]:
df

Unnamed: 0,W,X,Y,Z,sumWY
A,-0.023082,1.056658,0.328288,-0.256404,0.305205
B,-0.445849,0.464248,-1.303961,-1.730333,-1.74981
C,0.201453,-0.559383,-0.363483,-0.146483,-0.16203
D,1.882406,1.114555,-0.334895,1.125398,1.547511
E,-1.153043,1.907769,0.165622,-0.091237,-0.987421


Can also drop rows this way:

In [None]:
df.drop('E',axis=0)

** Selecting Rows**

In [41]:
df.loc['A']

W       -0.023082
X        1.056658
Y        0.328288
Z       -0.256404
sumWY    0.305205
Name: A, dtype: float64

Or select based off of position instead of label 

In [45]:
# print(df)
df.iloc[0]

W       -0.023082
X        1.056658
Y        0.328288
Z       -0.256404
sumWY    0.305205
Name: A, dtype: float64

** Selecting subset of rows and columns **

In [49]:
# print(df.loc['B','Y'])
df['Y']['B']


-1.3039611107402218

In [51]:
df.loc[['A','B'],['W','Y']]
# df

Unnamed: 0,W,Y
A,-0.023082,0.328288
B,-0.445849,-1.303961


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [52]:
df

Unnamed: 0,W,X,Y,Z,sumWY
A,-0.023082,1.056658,0.328288,-0.256404,0.305205
B,-0.445849,0.464248,-1.303961,-1.730333,-1.74981
C,0.201453,-0.559383,-0.363483,-0.146483,-0.16203
D,1.882406,1.114555,-0.334895,1.125398,1.547511
E,-1.153043,1.907769,0.165622,-0.091237,-0.987421


In [53]:
df>0

Unnamed: 0,W,X,Y,Z,sumWY
A,False,True,True,False,True
B,False,True,False,False,False
C,True,False,False,False,False
D,True,True,False,True,True
E,False,True,True,False,False


In [54]:
df[df>0]

Unnamed: 0,W,X,Y,Z,sumWY
A,,1.056658,0.328288,,0.305205
B,,0.464248,,,
C,0.201453,,,,
D,1.882406,1.114555,,1.125398,1.547511
E,,1.907769,0.165622,,


In [56]:
# print(df)
df[df['W']>0]

Unnamed: 0,W,X,Y,Z,sumWY
C,0.201453,-0.559383,-0.363483,-0.146483,-0.16203
D,1.882406,1.114555,-0.334895,1.125398,1.547511


In [None]:
print(df)
df[df['W']>0]['Y']

In [None]:
df[df['W']>0][['Y','X']]

For two conditions you can use | and & with parenthesis:

In [None]:
df[(df['W']>0) & (df['Y'] > 1)]

## More Index Details



In [None]:
df

In [None]:
# Reset to default 0,1...n index
df.reset_index()

In [None]:
newind = 'CA NY WY OR CO'.split()

In [None]:
df['States'] = newind

In [None]:
df

In [None]:
df.set_index('States')

In [None]:
df

In [None]:
df.set_index('States',inplace=True)

In [None]:
df