# Pandas - DataFrame

In [7]:
import numpy as np
import pandas as pd
from numpy.random import randn

In [30]:
# Seed NumPy's global random generator so the random values below are the same on every run
np.random.seed(101)

In [31]:
# 5x4 frame of standard-normal draws: rows labelled 1..5, columns 'A'..'D'
df = pd.DataFrame(
    data=randn(5, 4),
    index=[1, 2, 3, 4, 5],
    columns=list('ABCD'),
)

In [32]:
df

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [11]:
df['A']

1    2.706850
2    0.651118
3   -2.018168
4    0.188695
5    0.190794
Name: A, dtype: float64

In [12]:
type(df)

pandas.core.frame.DataFrame

In [13]:
type(df['A'])

pandas.core.series.Series

In [14]:
df[['A', 'D']]

Unnamed: 0,A,D
1,2.70685,0.503826
2,0.651118,0.605965
3,-2.018168,-0.589001
4,0.188695,0.955057
5,0.190794,0.683509


In [15]:
# New column holding the row-wise sum of columns 'A' and 'C'
df['A+C'] = df['A'].add(df['C'])

In [16]:
df

Unnamed: 0,A,B,C,D,A+C
1,2.70685,0.628133,0.907969,0.503826,3.614819
2,0.651118,-0.319318,-0.848077,0.605965,-0.196959
3,-2.018168,0.740122,0.528813,-0.589001,-1.489355
4,0.188695,-0.758872,-0.933237,0.955057,-0.744542
5,0.190794,1.978757,2.605967,0.683509,2.796762


In [17]:
# drop() returns a new frame without 'A+C'; df itself is not modified
df.drop(columns='A+C')

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [18]:
df

Unnamed: 0,A,B,C,D,A+C
1,2.70685,0.628133,0.907969,0.503826,3.614819
2,0.651118,-0.319318,-0.848077,0.605965,-0.196959
3,-2.018168,0.740122,0.528813,-0.589001,-1.489355
4,0.188695,-0.758872,-0.933237,0.955057,-0.744542
5,0.190794,1.978757,2.605967,0.683509,2.796762


In [19]:
# Delete the column permanently: inplace=True modifies df itself
df.drop(columns='A+C', inplace=True)

In [20]:
df

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [21]:
# Drop the row labelled 5; this returns a new frame and leaves df unchanged
df.drop(index=5)

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057


In [22]:
df

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [23]:
# Select a row by its index label (here the label 5); the result is a Series
df.loc[5]

A    0.190794
B    1.978757
C    2.605967
D    0.683509
Name: 5, dtype: float64

In [24]:
# Select a row by integer position, not by label: position 0 is the first row (label 1)
df.iloc[0]

A    2.706850
B    0.628133
C    0.907969
D    0.503826
Name: 1, dtype: float64

In [25]:
# Fetch one particular value by row label and column name
df.at[1, 'A']

2.706849839399938

In [26]:
df.loc[[1, 2, 3], ['A', 'B']]

Unnamed: 0,A,B
1,2.70685,0.628133
2,0.651118,-0.319318
3,-2.018168,0.740122


In [34]:
# Element-wise boolean mask: True wherever the value is greater than zero
booldf = df.gt(0)

In [35]:
booldf

Unnamed: 0,A,B,C,D
1,True,True,True,True
2,True,False,False,True
3,False,True,True,False
4,True,False,False,True
5,True,True,True,True


In [36]:
df[booldf]

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,,,0.605965
3,,0.740122,0.528813,
4,0.188695,,,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [37]:
df[df == 1]

Unnamed: 0,A,B,C,D
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,


In [38]:
df['A']>0

1     True
2     True
3    False
4     True
5     True
Name: A, dtype: bool

In [41]:
# Keep only the rows where column 'B' is positive; every column of those rows is returned
df.loc[df['B'] > 0]

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
3,-2.018168,0.740122,0.528813,-0.589001
5,0.190794,1.978757,2.605967,0.683509


In [42]:
df

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [45]:
df[df['C'] < 0]

Unnamed: 0,A,B,C,D
2,0.651118,-0.319318,-0.848077,0.605965
4,0.188695,-0.758872,-0.933237,0.955057


In [46]:
# Rows of df whose 'A' value is positive
greaterThanZero = df.loc[df['A'].gt(0)]

In [47]:
greaterThanZero

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [48]:
greaterThanZero['B']

1    0.628133
2   -0.319318
4   -0.758872
5    1.978757
Name: B, dtype: float64

In [49]:
# Read the value at row label 1, column 'B' with one explicit label-based lookup.
# Chained [] indexing with an integer key on an integer-labelled Series is
# ambiguous (label vs. position); .loc states the intent unambiguously.
greaterThanZero.loc[1, 'B']

0.6281327087844596

In [50]:
df.reset_index()

Unnamed: 0,index,A,B,C,D
0,1,2.70685,0.628133,0.907969,0.503826
1,2,0.651118,-0.319318,-0.848077,0.605965
2,3,-2.018168,0.740122,0.528813,-0.589001
3,4,0.188695,-0.758872,-0.933237,0.955057
4,5,0.190794,1.978757,2.605967,0.683509


In [51]:
df

Unnamed: 0,A,B,C,D
1,2.70685,0.628133,0.907969,0.503826
2,0.651118,-0.319318,-0.848077,0.605965
3,-2.018168,0.740122,0.528813,-0.589001
4,0.188695,-0.758872,-0.933237,0.955057
5,0.190794,1.978757,2.605967,0.683509


In [52]:
# Five labels, one per row of df; assigned further down as the 'vehicles' column.
# NOTE(review): 'Hundai' is probably meant to be 'Hyundai' — confirm before changing,
# since the saved outputs in this notebook show the current spelling.
vehicles = ['Ducati', 'Audi', 'BMW', 'Mercedes-Benz', 'Hundai']

In [53]:
vehicles

['Ducati', 'Audi', 'BMW', 'Mercedes-Benz', 'Hundai']

In [58]:
df['vehicles'] = vehicles

In [59]:
df

Unnamed: 0,A,B,C,D,vehicles
1,2.70685,0.628133,0.907969,0.503826,Ducati
2,0.651118,-0.319318,-0.848077,0.605965,Audi
3,-2.018168,0.740122,0.528813,-0.589001,BMW
4,0.188695,-0.758872,-0.933237,0.955057,Mercedes-Benz
5,0.190794,1.978757,2.605967,0.683509,Hundai


In [61]:
df.set_index('vehicles')

Unnamed: 0_level_0,A,B,C,D
vehicles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ducati,2.70685,0.628133,0.907969,0.503826
Audi,0.651118,-0.319318,-0.848077,0.605965
BMW,-2.018168,0.740122,0.528813,-0.589001
Mercedes-Benz,0.188695,-0.758872,-0.933237,0.955057
Hundai,0.190794,1.978757,2.605967,0.683509


In [63]:
df

Unnamed: 0,A,B,C,D,vehicles
1,2.70685,0.628133,0.907969,0.503826,Ducati
2,0.651118,-0.319318,-0.848077,0.605965,Audi
3,-2.018168,0.740122,0.528813,-0.589001,BMW
4,0.188695,-0.758872,-0.933237,0.955057,Mercedes-Benz
5,0.190794,1.978757,2.605967,0.683509,Hundai


In [69]:
# Build a two-level (hierarchical) index from two parallel label lists:
# the outer level is the group name, the inner level a number within the group.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_arrays([outside, inside])

In [70]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [71]:
# 6x2 frame of standard-normal draws, indexed by the two-level hier_index
new_df = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [72]:
new_df

Unnamed: 0,Unnamed: 1,A,B
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [73]:
new_df.loc['G1']

Unnamed: 0,A,B
1,0.302665,1.693723
2,-1.706086,-1.159119
3,-0.134841,0.390528


In [75]:
new_df.loc['G1'].loc[1]

A    0.302665
B    1.693723
Name: 1, dtype: float64

In [76]:
# Address the (outer, inner) index tuple and the column in a single .loc call
# instead of chaining three separate lookups; the scalar returned is the same.
new_df.loc[('G1', 1), 'A']

0.3026654485851825

In [77]:
new_df.index.names

FrozenList([None, None])

In [78]:
# Name the two index levels (outer, inner) so they can be referred to by name,
# e.g. via the level= argument of xs()
new_df.index.names = ['Groups', 'S. N.']

In [79]:
new_df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,S. N.,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [80]:
# Cross-section: the rows whose 'S. N.' level equals 2, taken from every group
new_df.xs(2, level='S. N.')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-1.706086,-1.159119
G2,0.807706,0.07296
