In [1]:
In [1]: import numpy as np
In [2]: import pandas as pd

# Series

In [2]:
# Five standard-normal draws, labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))

In [3]:
s

a    0.352473
b   -0.456347
c    1.881415
d   -0.147593
e   -0.167223
dtype: float64

In [4]:
s.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [5]:
pd.Series(np.random.rand(5))

0    0.202643
1    0.208093
2    0.807589
3    0.352301
4    0.759115
dtype: float64

In [7]:
# Building a Series from a dict: the keys become the index, in insertion order.
d = dict(b=1, a=0, c=2)
pd.Series(d)

b    1
a    0
c    2
dtype: int64

In [8]:
# series are LIKE ndarrays: an element can be fetched by its position.
# Use .iloc for positional access -- a plain integer key on a string-labelled
# index (s[0]) relies on a positional fallback that is deprecated
# (FutureWarning since pandas 2.1).
s.iloc[0]

0.3524729224576593

In [9]:
s[s > s.median()]

a    0.352473
c    1.881415
dtype: float64

In [10]:
np.exp(s)

a    1.422581
b    0.633594
c    6.562788
d    0.862782
e    0.846011
dtype: float64

In [11]:
s.dtype

dtype('float64')

In [12]:
s.to_numpy()

array([ 0.35247292, -0.45634662,  1.88141544, -0.14759311, -0.16722319])

In [13]:
# series are also like dicts
s['a']

0.3524729224576593

In [15]:
# Dict-style assignment: overwrite the value stored under label 'e' in place.
s['e'] = 12
s

a     0.352473
b    -0.456347
c     1.881415
d    -0.147593
e    12.000000
dtype: float64

In [16]:
'e' in s

True

In [17]:
# looping is not necessary for math
s + s

a     0.704946
b    -0.912693
c     3.762831
d    -0.295186
e    24.000000
dtype: float64

In [18]:
s * 2

a     0.704946
b    -0.912693
c     3.762831
d    -0.295186
e    24.000000
dtype: float64

In [19]:
np.exp(s)

a         1.422581
b         0.633594
c         6.562788
d         0.862782
e    162754.791419
dtype: float64

In [20]:
# Unlike ndarrays, Series align their operands on index labels automatically.
s1 = s.iloc[1:]
s2 = s.iloc[:-1]
s1 + s2

# Labels found in only one operand come out as NaN.

a         NaN
b   -0.912693
c    3.762831
d   -0.295186
e         NaN
dtype: float64

In [24]:
# A Series can carry a name, supplied here via the `name` argument.

s = pd.Series(data=np.random.randn(5), name='something')
s


0    2.057992
1   -0.069739
2   -0.031308
3   -0.196278
4    1.354640
Name: something, dtype: float64

In [23]:
s.name

'something'

# Dataframes

In [27]:
## A DataFrame is a labelled 2-D structure: think of a spreadsheet, or a dict of Series.

d = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
}

df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [28]:
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [32]:
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])
## Passing `columns` selects and orders the columns; a name that is not a key
## of the dict ('three' here) becomes a column of missing values.
## When data is a dictionary and `columns` is not passed,
## the DataFrame columns follow the dictionary's insertion order.

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


A dictionary of Series combined with an explicit index and/or columns discards any data that does not match the passed index and/or columns.

In [33]:
# Columns may also be supplied as plain, equal-length lists.
d = dict(
    one=[1., 2., 3., 4.],
    two=[4., 3., 2., 1.],
)

In [34]:
pd.DataFrame(d)

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [35]:
pd.DataFrame(d, index=['a', 'b', 'c', 'd'])

Unnamed: 0,one,two
a,1.0,4.0
b,2.0,3.0
c,3.0,2.0
d,4.0,1.0


In [37]:
pd.DataFrame(pd.Series(np.random.randn(5), name='something'))

Unnamed: 0,something
0,0.907671
1,-0.106011
2,-0.038921
3,0.052476
4,-1.066019


In [38]:
df['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [40]:
# New columns are added by dict-style assignment: first an element-wise
# product of two existing columns ...
df['three'] = df['one'] * df['two']

# ... then a boolean column from an element-wise comparison
# (the NaN in 'one' compares as False).
df['flag'] = df['one'] > 2
df

Unnamed: 0,one,two,three,flag
a,1.0,1.0,1.0,False
b,2.0,2.0,4.0,False
c,3.0,3.0,9.0,True
d,,4.0,,False


In [41]:
# Columns can be removed dict-style; this drops 'two' from df in place.
del df['two']

In [42]:
## When inserting a scalar value, it goes down the whole column

df['foo'] = 'bar'
df

Unnamed: 0,one,three,flag,foo
a,1.0,1.0,False,bar
b,2.0,4.0,False,bar
c,3.0,9.0,True,bar
d,,,False,bar


In [44]:
# Assigning a Series that covers only part of the index aligns it to the
# frame's index; rows without a matching label are left as NaN.
df['one_trunc'] = df['one'].iloc[:2]
df

Unnamed: 0,one,three,flag,foo,one_trunc
a,1.0,1.0,False,bar,1.0
b,2.0,4.0,False,bar,2.0
c,3.0,9.0,True,bar,
d,,,False,bar,


In [48]:
# Eight rows of uniform [0, 1) samples in three columns; scalar arithmetic
# is applied element-wise to the whole frame.
df = pd.DataFrame(data=np.random.rand(8, 3), columns=['A', 'B', 'C'])
5 * df + 2

Unnamed: 0,A,B,C
0,6.524992,6.767855,2.171875
1,3.450874,4.879685,3.651127
2,5.761121,6.571458,6.086737
3,5.954875,6.042768,5.980533
4,5.636579,4.700132,2.62002
5,2.762348,4.010387,5.816246
6,5.91299,3.924062,5.407186
7,4.030314,6.351857,3.818958


# dtypes

In [49]:
# One column per dtype; scalar values are broadcast to the length of the
# array-like columns (three rows here).
dft = pd.DataFrame({
    'A': np.random.rand(3),
    'B': 1,
    'C': 'foo',
    'D': pd.Timestamp('20010102'),
    'E': pd.Series([1.0, 1.0, 1.0]).astype('float32'),
    'F': False,
    'G': pd.Series([1, 1, 1], dtype='int8'),
})
dft

Unnamed: 0,A,B,C,D,E,F,G
0,0.873216,1,foo,2001-01-02,1.0,False,1
1,0.124347,1,foo,2001-01-02,1.0,False,1
2,0.087905,1,foo,2001-01-02,1.0,False,1


In [50]:
dft.dtypes

A           float64
B             int64
C            object
D    datetime64[ns]
E           float32
F              bool
G              int8
dtype: object

In [51]:
dft['A'].dtype

dtype('float64')

In [52]:
 # Values of mixed types (ints, a float and a str) end up in an object-dtype Series.
 pd.Series([1, 2, 3, 6., 'foo'])

0      1
1      2
2      3
3    6.0
4    foo
dtype: object

In [57]:
# Start from a float32 frame. A bare `df1.dtypes` in the middle of a cell is
# evaluated but never displayed (only the last expression is shown), so the
# starting dtype is checked with an assertion instead.
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')
assert df1['A'].dtype == np.float32

# astype() returns a converted frame; rebind the name to keep the float64 version.
df1 = df1.astype('float64')
df1.dtypes

A    float64
dtype: object

In [59]:
# Convert certain columns to a specific dtype by passing a dict to astype();
# columns not named in the dict ('b' here) keep their dtype.
dft1 = pd.DataFrame({'a': [1, 0, 1], 'b': [4, 5, 6], 'c': [7, 8, 9]})
dft1 = dft1.astype({'a': bool, 'c': np.float64})
# End the cell with the frame itself so the converted result is displayed.
dft1


Unnamed: 0,a,b,c
0,True,4,7.0
1,False,5,8.0
2,True,6,9.0
