## Series

In [1]:
import numpy as np
import pandas as pd

In [2]:
print(np.__version__)

1.18.5


In [3]:
print(pd.__version__)

1.0.5


In [None]:
# pandas and NumPy upcast integer data to float when missing values (NaN) appear, so that as much information as possible is retained

In [4]:
# Sample inputs reused by the Series examples that follow:
# a list of labels, a plain list, the same values as an ndarray, and a dict.
labels = list('abc')
my_data = [1, 2, 3]
arr = np.asarray(my_data)
d = dict(zip(labels, (10, 20, 30)))

In [5]:
pd.Series(data = my_data, index = labels)

a    1
b    2
c    3
dtype: int64

In [6]:
pd.Series(arr)

0    1
1    2
2    3
dtype: int32

In [7]:
pd.Series(arr, labels)

a    1
b    2
c    3
dtype: int32

In [8]:
d

{'a': 10, 'b': 20, 'c': 30}

In [9]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [10]:
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [11]:
# Fruit-labelled Series: the values come first, the index labels second.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['Apple', 'Papaya', 'Banana', 'Orange'],
)
ser1

Apple     1
Papaya    2
Banana    3
Orange    4
dtype: int64

In [19]:
ser2 = pd.Series(data=[1,2,5,4],index=["Apple","Papaya","Grape","Orange"])

In [20]:
ser2['Papaya']

2

In [21]:
ser2

Apple     1
Papaya    2
Grape     5
Orange    4
dtype: int64

In [22]:
ser1 + ser2

Apple     2.0
Banana    NaN
Grape     NaN
Orange    8.0
Papaya    4.0
dtype: float64

In [14]:
ser1["Apple"]

1

In [15]:
ser3 = pd.Series(data=labels)

In [16]:
ser3

0    a
1    b
2    c
dtype: object

In [17]:
ser3[0]

'a'

In [4]:
pd.Series(data=[print,sum,len])

0    <built-in function print>
1      <built-in function sum>
2      <built-in function len>
dtype: object

## Dataframe

In [8]:
import numpy as np
import pandas as pd

In [5]:
from numpy.random import randn

In [6]:
np.random.seed(101)

In [16]:
# 5x4 frame of randn draws; rows are labelled A-E and columns W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)

In [17]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [18]:
df['W'] #output as a Series

A   -0.993263
B    1.025984
C    2.154846
D    0.147027
E   -0.925874
Name: W, dtype: float64

In [19]:
type(df['W'])

pandas.core.series.Series

In [20]:
type(df)

pandas.core.frame.DataFrame

# We can think of a DataFrame as a bunch of Series that share the same index

In [22]:
df[['W','Z']] #output as dataframe

Unnamed: 0,W,Z
A,-0.993263,0.000366
B,1.025984,0.649826
C,2.154846,-0.346419
D,0.147027,1.02481
E,-0.925874,0.610478


In [23]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [24]:
# 'new' is not a column yet, so this lookup raises KeyError.
# Catch and print it so the failure is demonstrated without halting
# a Restart & Run All of the notebook.
try:
    df['new']
except KeyError as exc:
    print(f"{type(exc).__name__}: {exc}")

KeyError: 'new'

In [25]:
df['new'] = df['W'] + df['X']

In [26]:
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,1.544588
D,0.147027,-0.479448,0.558769,1.02481,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.93699


In [27]:
df.drop('new',axis=1)

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [29]:
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,-0.796464
B,1.025984,-0.156598,-0.031579,0.649826,0.869386
C,2.154846,-0.610259,-0.755325,-0.346419,1.544588
D,0.147027,-0.479448,0.558769,1.02481,-0.332421
E,-0.925874,1.862864,-1.133817,0.610478,0.93699


In [30]:
# inplace=True modifies df itself instead of returning a new frame,
# so the 'new' column is gone from df in the cells that follow.
df.drop('new',axis=1,inplace=True)

In [31]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [32]:
df.drop('E') # no inplace

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481


In [33]:
df.shape

(5, 4)

In [45]:
# Rows are Series as well, not only the columns. .loc selects by index label.
df.loc['A']

W   -0.993263
X    0.196800
Y   -1.136645
Z    0.000366
Name: A, dtype: float64

In [34]:
# .iloc selects by integer position; position 2 is the row labelled 'C',
# so this returns the same row as df.loc['C'].
df.iloc[2]

W    2.154846
X   -0.610259
Y   -0.755325
Z   -0.346419
Name: C, dtype: float64

In [39]:
df.loc['A','W']

-0.993263499973366

In [40]:
df

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [42]:
df.loc[['C','E'],['X','Y']]

Unnamed: 0,X,Y
C,-0.610259,-0.755325
E,1.862864,-1.133817


## DataFrame 2

In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [3]:
np.random.seed(101)

In [4]:
# 5x4 frame of randn draws with explicit row (A-E) and column (W-Z) labels.
row_labels = ['A', 'B', 'C', 'D', 'E']
col_labels = ['W', 'X', 'Y', 'Z']
df = pd.DataFrame(data=randn(5, 4), index=row_labels, columns=col_labels)
del row_labels, col_labels  # keep the notebook namespace identical to before

In [5]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [6]:
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [7]:
booldf = df > 0
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [8]:
# Indexing with a boolean DataFrame (a condition) keeps the values where the
# condition is True; positions where it is False come back as NaN.
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [15]:
df.loc[['A','C'],['W','X']]

Unnamed: 0,W,X
A,2.70685,0.628133
C,-2.018168,0.740122


In [10]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [19]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [21]:
# Indexing with a boolean Series keeps only the rows where it is True.
# Whole rows are kept or dropped, so no NaN values are introduced.
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [22]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [24]:
df[df['Z']<0] #this is sub-data that we used by conditional selection

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [30]:
resultdf = df[df['W']>0]

In [31]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [32]:
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [34]:
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [35]:
# Return column 'X' for the rows where column W is greater than zero.
# A single .loc call (row mask, column label) replaces two chained [] lookups.
df.loc[df['W']>0, 'X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [37]:
df[df['W']>0].loc[['A','B'],['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965


In [39]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


## Example: step by step

In [52]:
# Step 1: boolean Series marking the rows where X is negative.
boolser = df['X']<0
# Step 2: keep only those rows.
result = df[boolser]
# Step 3: pick the row and column labels to pull out of the filtered frame.
myrows = ['B']
mycols = ['X','Y','Z']
result.loc[myrows,mycols]

Unnamed: 0,X,Y,Z
B,-0.319318,-0.848077,0.605965


In [53]:
boolser

A    False
B     True
C    False
D     True
E    False
Name: X, dtype: bool

In [54]:
result

Unnamed: 0,W,X,Y,Z
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


## In one step

In [55]:
df[df['X']<0].loc[['B'],['X','Y','Z']]

Unnamed: 0,X,Y,Z
B,-0.319318,-0.848077,0.605965


In [56]:
# As you get more comfortable with pandas, you'll find yourself using one-liners like this more and more often

# Multiple conditions

In [61]:
# Python's `and` needs a single truth value from each operand, but each operand
# here is a whole boolean Series, so pandas raises ValueError.
# Catch and print it so the failure is demonstrated without halting
# a Restart & Run All of the notebook.
try:
    df[(df['W']>0) and (df['Y']>1)]
except ValueError as exc:
    print(f"{type(exc).__name__}: {exc}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [58]:
# The `and` operator works on a single boolean value on each side...
True and False
# ...but a Series holds more than one boolean value, hence the ValueError above.

False

In [59]:
df['W']>0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [62]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [63]:
# To combine multiple conditions in pandas, use an ampersand (&) instead of
# `and`, with each condition wrapped in its own parentheses.
df[(df['W']>0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [64]:
# The pipe (|) is the element-wise "or": keep rows where either condition holds.
df[(df['W']>0) | (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [65]:
#setting the index
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [66]:
# reset_index() replaces the index with a default integer index and moves the
# old index into a new column named 'index'. It returns a new frame; df itself
# is left unchanged unless inplace=True is specified.
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [71]:
# One state code per row of df, in row order.
newIndex = ['CA', 'NY', 'WY', 'OR', 'CO']

In [72]:
df['States'] = newIndex

In [73]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [74]:
# Use the 'States' column as the index of the DataFrame.
# This returns a new frame; df itself keeps its original index.
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [75]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


# DATAFRAME 3

In [1]:
import numpy as np
import pandas as pd

## Multi-index DataFrame

In [2]:
# Two-level index: the outer level is the group label, the inner level is the
# number within that group. The (outer, inner) pairs are built with zip.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [3]:
list(zip(outside,inside))

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [4]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [7]:
# Seed the generator in this cell so the frame shown below is reproducible, and
# call randn through np.random: this section's import cell only brings in
# numpy and pandas, so a bare `randn` would raise NameError on a fresh kernel.
np.random.seed(101)
df = pd.DataFrame(np.random.randn(6,2),hier_index,['A','B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,2.70685,0.628133
G1,2,0.907969,0.503826
G1,3,0.651118,-0.319318
G2,1,-0.848077,0.605965
G2,2,-2.018168,0.740122
G2,3,0.528813,-0.589001


In [9]:
df.loc['G1']

Unnamed: 0,A,B
1,2.70685,0.628133
2,0.907969,0.503826
3,0.651118,-0.319318


In [10]:
df.loc['G1'].loc[1]

A    2.706850
B    0.628133
Name: 1, dtype: float64

In [12]:
df.index.names = ['Groups','Num']

In [13]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,2.70685,0.628133
G1,2,0.907969,0.503826
G1,3,0.651118,-0.319318
G2,1,-0.848077,0.605965
G2,2,-2.018168,0.740122
G2,3,0.528813,-0.589001


In [15]:
df.loc['G2'].loc[2,'B']

0.7401220570561068

In [16]:
df.loc['G2'].loc[[2],['B']]

Unnamed: 0_level_0,B
Num,Unnamed: 1_level_1
2,0.740122


In [17]:
df.loc['G2'].loc[2]

A   -2.018168
B    0.740122
Name: 2, dtype: float64

In [19]:
df.loc['G1'].loc[[2,3],['A','B']]

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.907969,0.503826
3,0.651118,-0.319318


In [20]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,2.70685,0.628133
G1,2,0.907969,0.503826
G1,3,0.651118,-0.319318
G2,1,-0.848077,0.605965
G2,2,-2.018168,0.740122
G2,3,0.528813,-0.589001


In [22]:
df.loc['G1']

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.70685,0.628133
2,0.907969,0.503826
3,0.651118,-0.319318


In [23]:
# xs() returns a cross-section of the frame. With a multi-level index it can
# select on the outer level, as here, or reach an inner level via `level=`.
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2.70685,0.628133
2,0.907969,0.503826
3,0.651118,-0.319318


In [24]:
df.xs(1,level='Num')

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,2.70685,0.628133
G2,-0.848077,0.605965
