In [234]:
# Pandas Web User Guide Follow along
# Working with text data
# https://pandas.pydata.org/docs/user_guide/text.html
# Created 11/28/20

%matplotlib notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from io import StringIO

# ast_node_interactivity controls which statements in a cell have their value
# echoed automatically. The available settings are discussed at:
# https://ipython.readthedocs.io/en/stable/config/options/terminal.html
# Options are: 'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'
# Default is: 'last_expr'

from IPython.core.interactiveshell import InteractiveShell
# NOTE: 'last_expr_or_assign' is the only non-default setting I could get to
# work here; with it, a cell ending in an assignment echoes the assigned value.
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

In [235]:
def _show(obj):
    """Render ``obj`` with IPython's rich ``display``, falling back to ``print``.

    ``display`` is only available as a bare name inside an IPython/Jupyter
    session, so the fallback also covers running from plain Python.
    """
    try:
        display(obj)
    except Exception:  # includes NameError when ``display`` is unavailable
        print(obj)


def diag(*args):
    """Pandas diagnostics: print a short summary of each argument, then show it.

    For a DataFrame the ``info()`` report is printed, followed by the frame.
    For anything else a separator line and the type are printed, then the
    length (when the object has one), then its ``info()`` report (when it
    offers one), and finally the object itself.

    Parameters
    ----------
    *args
        Any number of objects to inspect, processed in the order given.

    Returns
    -------
    None
        All results are written to the cell output.
    """
    for item in args:
        if isinstance(item, pd.DataFrame):
            # ``info()`` prints its report itself and returns None; wrapping it
            # in ``print`` used to add a stray "None" line after every report.
            item.info()
            _show(item)
            continue

        print("-" * 40)
        print(f'Type: {type(item)}')

        try:
            print(f'Length: {len(item)}')
        except TypeError:
            pass  # unsized objects (scalars, pd.NA, ...) get no length line

        # Best effort: only some objects provide a no-argument ``info()``.
        info = getattr(item, "info", None)
        if callable(info):
            try:
                info()
            except Exception:
                pass

        _show(item)
                
# Short aliases for interactive use.
z = diag
# NOTE(review): later cells rebind `d` (In [90] to a DataFrame, In [173] to a
# dict), so `d(...)` only works if this cell has been re-run since then.
# The trailing `;` is presumably there to suppress the echo of the assignment.
d = display;

In [236]:
def read_df(text):
    """Build a DataFrame from the whitespace-aligned text of a printed frame.

    Intended for tables copied from the pandas website tutorial: the first
    non-blank line holds the column labels and every following line holds an
    index label followed by one value per column.

    Parameters
    ----------
    text : str
        The copied table. Blank lines (for example a trailing newline) are
        ignored.

    Returns
    -------
    pandas.DataFrame
        Frame whose cells are the raw string tokens (``'NaN'`` stays the text
        ``'NaN'``); convert afterwards, e.g. ``read_df(text).astype(float)``.
        An empty frame is returned when ``text`` has no non-blank lines.
    """
    # Tokenise on any whitespace run; skipping blank lines avoids the IndexError
    # that an empty token list would cause when reading the index label.
    rows = [line.split() for line in text.splitlines() if line.strip()]
    if not rows:
        return pd.DataFrame()

    cols, records = rows[0], rows[1:]
    index = [tokens[0] for tokens in records]
    array = [tokens[1:] for tokens in records]
    return pd.DataFrame(array, index=index, columns=cols)

In [6]:
a = pd.Series(['a', 'b', 'c'])
b = pd.Series(['a', 'b', 'c'], dtype="string")
z(a, b)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 3


0    a
1    b
2    c
dtype: object

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 3


0    a
1    b
2    c
dtype: string

In [7]:
s = pd.Series(['a', 'b', 'c'])

0    a
1    b
2    c
dtype: object

In [8]:
s.astype("string")

0    a
1    b
2    c
dtype: string

In [9]:
s = pd.Series(['a', 2, np.nan], dtype="string")

0       a
1       2
2    <NA>
dtype: string

In [10]:
arr = pd.array([1, 2, None], dtype=pd.Int64Dtype())

<IntegerArray>
[1, 2, <NA>]
Length: 3, dtype: Int64

In [11]:
df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')})

Unnamed: 0,A,B,C
0,a,1,a
1,2,1,a
2,,3,b


In [12]:
df['A']

0       a
1       2
2    <NA>
Name: A, dtype: string

In [16]:
pd.concat([df['A'], df[['B', 'C']]], axis=1)

Unnamed: 0,A,B,C
0,a,1,a
1,2,1,a
2,,3,b


In [20]:
pd.concat([df.loc[:, ['A']], df.loc[:, ['B', 'C']]], axis=1)

Unnamed: 0,A,B,C
0,a,1,a
1,2,1,a
2,,3,b


In [21]:
a = pd.array([1, None], dtype="Int64")
b = a[1]
z(a, b)

----------------------------------------
Type: <class 'pandas.core.arrays.integer.IntegerArray'>
Length: 2


<IntegerArray>
[1, <NA>]
Length: 2, dtype: Int64

----------------------------------------
Type: <class 'pandas._libs.missing.NAType'>


<NA>

In [22]:
dir(b)

['_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__le__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmatmul__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '__xor__',
 '_instance']

In [23]:
s = pd.Series([1, 2, 3])

0    1
1    2
2    3
dtype: int64

In [24]:
mask = pd.array([True, False, pd.NA], dtype="boolean")

<BooleanArray>
[True, False, <NA>]
Length: 3, dtype: boolean

In [25]:
s[mask]

0    1
dtype: int64

In [26]:
pd.Series([True, False, np.nan], dtype="object") | True

0     True
1     True
2    False
dtype: bool

In [27]:
pd.Series([True, False, np.nan], dtype="boolean") | True

0    True
1    True
2    True
dtype: boolean

In [28]:
pd.Series([True, False, np.nan], dtype="object") & True

0     True
1    False
2    False
dtype: bool

In [29]:
pd.Series([True, False, np.nan], dtype="boolean") & True

0     True
1    False
2     <NA>
dtype: boolean

In [35]:
data = [True, False, np.nan]

# NOTE(review): the constructors below are fed `a`, not `data`. Judging by the
# recorded output, `a` was still the Int64 array [1, <NA>] from In [21], which
# is why these results have length 2 instead of 3 — `data` was probably intended.
pd_o_series = pd.Series(a, dtype="object")
pd_b_series = pd.Series(a, dtype="boolean")
np_o_series = np.array(a, dtype="object")
# np_b_series = np.array(a, dtype=bool)
# NOTE(review): `np_b_series` is not assigned in this cell (the line above is
# commented out) and its only visible assignment is in a later cell (In [36]),
# so this call depends on leftover kernel state and would raise NameError when
# the notebook is run top to bottom on a fresh kernel.
z(data, pd_o_series, pd_b_series, np_o_series, np_b_series)

----------------------------------------
Type: <class 'list'>
Length: 3


[True, False, nan]

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 2


0       1
1    <NA>
dtype: object

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 2


0    True
1    <NA>
dtype: boolean

----------------------------------------
Type: <class 'numpy.ndarray'>
Length: 2


array([1, <NA>], dtype=object)

----------------------------------------
Type: <class 'numpy.ndarray'>
Length: 3


array([ True, False,  True])

In [32]:
np_b_series

array([ True, False,  True])

In [36]:
np_b_series = np.array([True, False, np.nan], dtype=bool)

array([ True, False,  True])

In [37]:
s = pd.Series(["a", None, "b"], dtype="string")

0       a
1    <NA>
2       b
dtype: string

In [38]:
type(s[1])

pandas._libs.missing.NAType

In [39]:
 s.str.count("a")

0       1
1    <NA>
2       0
dtype: Int64

In [40]:
sum( s.str.count("a"))

<NA>

In [41]:
s.dropna()

0    a
2    b
dtype: string

In [44]:
s.dropna().value_counts()

b    1
a    1
dtype: Int64

In [50]:
s2 = pd.Series(["abba", None, "b"], dtype="object")

0    abba
1    None
2       b
dtype: object

In [51]:
s2.str.count("a")

0    2.0
1    NaN
2    0.0
dtype: float64

In [52]:
sum(s2.str.count("a"))

nan

In [53]:
s2.dropna().value_counts()

b       1
abba    1
dtype: int64

In [54]:
s2.dropna().str.count("a")

0    2
2    0
dtype: int64

In [57]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],
             dtype="string")
s2 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'],)
z(s, s2)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 9


0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 9


0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object

In [58]:
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
8     cat
dtype: string

In [59]:
s.str.len()

0       1
1       1
2       1
3       4
4       4
5    <NA>
6       4
7       3
8       3
dtype: Int64

In [61]:
df = pd.DataFrame(np.random.randn(3, 2),
                  columns=[' Column A ', ' Column B '], index=range(3))
z(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0    Column A   3 non-null      float64
 1    Column B   3 non-null      float64
dtypes: float64(2)
memory usage: 112.0 bytes
None


Unnamed: 0,Column A,Column B
0,-1.387667,-1.009521
1,0.166759,0.035338
2,-0.421559,-0.981497


In [72]:
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

In [73]:
df.columns

Index(['column_a', 'column_b'], dtype='object')

In [74]:
s2 = pd.Series(['a_b_c', 'c_d_e', np.nan, 'f_g_h'], dtype="string")

0    a_b_c
1    c_d_e
2     <NA>
3    f_g_h
dtype: string

In [75]:
s2.str.split('_')

0    [a, b, c]
1    [c, d, e]
2         <NA>
3    [f, g, h]
dtype: object

In [78]:
s3 = s2.str.split('_', expand=True)
z(s3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       3 non-null      string
 1   1       3 non-null      string
 2   2       3 non-null      string
dtypes: string(3)
memory usage: 112.0 bytes
None


Unnamed: 0,0,1,2
0,a,b,c
1,c,d,e
2,,,
3,f,g,h


In [79]:
s3 = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca',
                '', np.nan, 'CABA', 'dog', 'cat'],
                dtype="string") 

0       A
1       B
2       C
3    Aaba
4    Baca
5        
6    <NA>
7    CABA
8     dog
9     cat
dtype: string

In [80]:
# The pattern is a regular expression, so request regex matching explicitly:
# pandas 2.0 changed the default of `Series.str.replace` to literal matching.
s3.str.replace('^.a|dog', 'XX-XX ', case=False, regex=True)

0           A
1           B
2           C
3    XX-XX ba
4    XX-XX ca
5            
6        <NA>
7    XX-XX BA
8      XX-XX 
9     XX-XX t
dtype: string

In [81]:
s = pd.Series(['a', 'b', 'c', 'd'], dtype="string")
s.str.cat(sep=',')

'a,b,c,d'

In [84]:
 s.str.cat(['A', 'B', 'C', 'D'])

0    aA
1    bB
2    cC
3    dD
dtype: string

In [85]:
t

NameError: name 't' is not defined

In [86]:
t = pd.Series(['a', 'b', np.nan, 'd'], dtype="string")

0       a
1       b
2    <NA>
3       d
dtype: string

In [87]:
s.str.cat(t)

0      aa
1      bb
2    <NA>
3      dd
dtype: string

In [88]:
s.str.cat(t, na_rep='-')

0    aa
1    bb
2    c-
3    dd
dtype: string

In [89]:
z(t, s)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 4


0       a
1       b
2    <NA>
3       d
dtype: string

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 4


0    a
1    b
2    c
3    d
dtype: string

In [90]:
d = pd.concat([t, s], axis=1)

Unnamed: 0,0,1
0,a,a
1,b,b
2,,c
3,d,d


In [91]:
u = pd.Series(['b', 'd', 'a', 'c'], index=[1, 3, 0, 2],
              dtype="string")

1    b
3    d
0    a
2    c
dtype: string

In [92]:
s.str.cat(u)

0    aa
1    bb
2    cc
3    dd
dtype: string

In [98]:
u.reset_index(drop=True)

0    b
1    d
2    a
3    c
dtype: string

In [99]:
s.str.cat(u.reset_index(drop=True))

0    ab
1    bd
2    ca
3    dc
dtype: string

In [100]:
s.str.cat(u)

0    aa
1    bb
2    cc
3    dd
dtype: string

In [101]:
s.str.cat(u, join='left')

0    aa
1    bb
2    cc
3    dd
dtype: string

In [102]:
v = pd.Series(['z', 'a', 'b', 'd', 'e'], index=[-1, 0, 1, 3, 4],
            dtype="string")

-1    z
 0    a
 1    b
 3    d
 4    e
dtype: string

In [103]:
s

0    a
1    b
2    c
3    d
dtype: string

In [105]:
s.str.cat(v, join='left', na_rep='-')

0    aa
1    bb
2    c-
3    dd
dtype: string

In [106]:
s.str.cat(v, join='outer', na_rep='-')

-1    -z
 0    aa
 1    bb
 2    c-
 3    dd
 4    -e
dtype: string

In [107]:
d

Unnamed: 0,0,1
0,a,a
1,b,b
2,,c
3,d,d


In [108]:
d.loc[[3, 2, 1, 0], :]

Unnamed: 0,0,1
3,d,d
2,,c
1,b,b
0,a,a


In [109]:
f = d.loc[[3, 2, 1, 0], :]

Unnamed: 0,0,1
3,d,d
2,,c
1,b,b
0,a,a


In [110]:
s.str.cat(f, join='left', na_rep='-')

0    aaa
1    bbb
2    c-c
3    ddd
dtype: string

In [111]:
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan,
               'CABA', 'dog', 'cat'],
                dtype="string")

0       A
1       B
2       C
3    Aaba
4    Baca
5    <NA>
6    CABA
7     dog
8     cat
dtype: string

In [116]:
s.str[-1::-1]

0       A
1       B
2       C
3    abaA
4    acaB
5    <NA>
6    ABAC
7     god
8     tac
dtype: string

In [117]:
s = pd.Series(['a', 'a|b', np.nan, 'a|c'], dtype="string")

0       a
1     a|b
2    <NA>
3     a|c
dtype: string

In [121]:
df = pd.DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'],
              columns=['one', 'two', 'three'])

Unnamed: 0,one,two,three
a,0.371024,1.949661,0.120921
c,-0.008251,1.34119,0.373918
e,-0.569968,2.812902,-1.820883
f,1.947023,0.275765,0.03545
h,0.53317,-0.37884,-0.137854


In [122]:
df['four'] = 'bar'
df['five'] = df['one'] > 0
df

Unnamed: 0,one,two,three,four,five
a,0.371024,1.949661,0.120921,bar,True
c,-0.008251,1.34119,0.373918,bar,False
e,-0.569968,2.812902,-1.820883,bar,False
f,1.947023,0.275765,0.03545,bar,True
h,0.53317,-0.37884,-0.137854,bar,True


In [127]:
df2 = df.reindex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
df3 = df2.copy()

Unnamed: 0,one,two,three,four,five
a,0.371024,1.949661,0.120921,bar,True
b,,,,,
c,-0.008251,1.34119,0.373918,bar,False
d,,,,,
e,-0.569968,2.812902,-1.820883,bar,False
f,1.947023,0.275765,0.03545,bar,True
g,,,,,
h,0.53317,-0.37884,-0.137854,bar,True


In [128]:
df3['one_isna'] = df2['one'].isna()

In [129]:
df3['one_notna'] = df2['one'].notna()

In [130]:
df3

Unnamed: 0,one,two,three,four,five,one_isna,one_notna
a,0.371024,1.949661,0.120921,bar,True,False,True
b,,,,,,True,False
c,-0.008251,1.34119,0.373918,bar,False,False,True
d,,,,,,True,False
e,-0.569968,2.812902,-1.820883,bar,False,False,True
f,1.947023,0.275765,0.03545,bar,True,False,True
g,,,,,,True,False
h,0.53317,-0.37884,-0.137854,bar,True,False,True


In [131]:
df3['_and_'] = df3['one_isna'] & df3['one_notna']
df3['_or_'] = df3['one_isna'] | df3['one_notna']
df3

Unnamed: 0,one,two,three,four,five,one_isna,one_notna,_and_,_or_
a,0.371024,1.949661,0.120921,bar,True,False,True,False,True
b,,,,,,True,False,False,True
c,-0.008251,1.34119,0.373918,bar,False,False,True,False,True
d,,,,,,True,False,False,True
e,-0.569968,2.812902,-1.820883,bar,False,False,True,False,True
f,1.947023,0.275765,0.03545,bar,True,False,True,False,True
g,,,,,,True,False,False,True
h,0.53317,-0.37884,-0.137854,bar,True,False,True,False,True


In [132]:
df2.isna()

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [133]:
None == None

True

In [134]:
np.nan == np.nan

False

In [135]:
np.nan is np.nan

True

In [137]:
a = pd.Series([1, 2, np.nan, 4], dtype=pd.Int64Dtype())

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [138]:
a[2]

<NA>

In [141]:
b = a[2]
print(type(b))
dir(b)

<class 'pandas._libs.missing.NAType'>


['_HANDLED_TYPES',
 '__abs__',
 '__add__',
 '__and__',
 '__array_priority__',
 '__array_ufunc__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__divmod__',
 '__doc__',
 '__eq__',
 '__floordiv__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__invert__',
 '__le__',
 '__lt__',
 '__matmul__',
 '__mod__',
 '__module__',
 '__mul__',
 '__ne__',
 '__neg__',
 '__new__',
 '__or__',
 '__pos__',
 '__pow__',
 '__radd__',
 '__rand__',
 '__rdivmod__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rfloordiv__',
 '__rmatmul__',
 '__rmod__',
 '__rmul__',
 '__ror__',
 '__rpow__',
 '__rsub__',
 '__rtruediv__',
 '__rxor__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__truediv__',
 '__weakref__',
 '__xor__',
 '_instance']

In [142]:
df2 = df.copy()

Unnamed: 0,one,two,three,four,five
a,0.371024,1.949661,0.120921,bar,True
c,-0.008251,1.34119,0.373918,bar,False
e,-0.569968,2.812902,-1.820883,bar,False
f,1.947023,0.275765,0.03545,bar,True
h,0.53317,-0.37884,-0.137854,bar,True


In [146]:
df2['timestamp'] = pd.Timestamp('20120101')
z(df2)

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to h
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   one        5 non-null      float64       
 1   two        5 non-null      float64       
 2   three      5 non-null      float64       
 3   four       5 non-null      object        
 4   five       5 non-null      bool          
 5   timestamp  5 non-null      datetime64[ns]
dtypes: bool(1), datetime64[ns](1), float64(3), object(1)
memory usage: 301.0+ bytes
None


Unnamed: 0,one,two,three,four,five,timestamp
a,0.371024,1.949661,0.120921,bar,True,2012-01-01
c,-0.008251,1.34119,0.373918,bar,False,2012-01-01
e,-0.569968,2.812902,-1.820883,bar,False,2012-01-01
f,1.947023,0.275765,0.03545,bar,True,2012-01-01
h,0.53317,-0.37884,-0.137854,bar,True,2012-01-01


In [147]:
df2.loc[['a', 'c', 'h'], ['one', 'timestamp']] = np.nan

In [150]:
d(df2)

Unnamed: 0,one,two,three,four,five,timestamp
a,,1.949661,0.120921,bar,True,NaT
c,,1.34119,0.373918,bar,False,NaT
e,-0.569968,2.812902,-1.820883,bar,False,2012-01-01
f,1.947023,0.275765,0.03545,bar,True,2012-01-01
h,,-0.37884,-0.137854,bar,True,NaT


In [151]:
df2.dtypes.value_counts()

float64           3
object            1
bool              1
datetime64[ns]    1
dtype: int64

In [152]:
a

0       1
1       2
2    <NA>
3       4
dtype: Int64

In [153]:
t= """        one       two
a       NaN -0.282863
c       NaN  1.212112
e  0.119209 -1.044236
f -2.104569 -0.494929
h -2.104569 -0.706771"""
a = read_df(t)

Unnamed: 0,one,two
a,,-0.282863
c,,1.212112
e,0.119209,-1.044236
f,-2.104569,-0.494929
h,-2.104569,-0.706771


In [155]:
t2 = """        one       two     three
a       NaN -0.282863 -1.509059
c       NaN  1.212112 -0.173215
e  0.119209 -1.044236 -0.861849
f -2.104569 -0.494929  1.071804
h       NaN -0.706771 -1.039575"""
b = read_df(t2)

Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,1.212112,-0.173215
e,0.119209,-1.044236,-0.861849
f,-2.104569,-0.494929,1.071804
h,,-0.706771,-1.039575


In [156]:
df

Unnamed: 0,one,two,three,four,five
a,0.371024,1.949661,0.120921,bar,True
c,-0.008251,1.34119,0.373918,bar,False
e,-0.569968,2.812902,-1.820883,bar,False
f,1.947023,0.275765,0.03545,bar,True
h,0.53317,-0.37884,-0.137854,bar,True


In [157]:
t3="""        one       two     three
a       NaN -0.282863 -1.509059
c       NaN  1.212112 -0.173215
e  0.119209 -1.044236 -0.861849
f -2.104569 -0.494929  1.071804
h       NaN -0.706771 -1.039575"""
df = read_df(t3)

Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,1.212112,-0.173215
e,0.119209,-1.044236,-0.861849
f,-2.104569,-0.494929,1.071804
h,,-0.706771,-1.039575


In [159]:
z(df)

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to h
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   one     5 non-null      object
 1   two     5 non-null      object
 2   three   5 non-null      object
dtypes: object(3)
memory usage: 80.0+ bytes
None


Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,1.212112,-0.173215
e,0.119209,-1.044236,-0.861849
f,-2.104569,-0.494929,1.071804
h,,-0.706771,-1.039575


In [160]:
df = df.astype(float)

Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,1.212112,-0.173215
e,0.119209,-1.044236,-0.861849
f,-2.104569,-0.494929,1.071804
h,,-0.706771,-1.039575


In [161]:
df['one'].sum()

-1.9853600000000002

In [162]:
df.mean(1)

a   -0.895961
c    0.519448
e   -0.595625
f   -0.509231
h   -0.873173
dtype: float64

In [163]:
df.cumsum()

Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,0.929249,-1.682274
e,0.119209,-0.114987,-2.544123
f,-1.98536,-0.609916,-1.472319
h,,-1.316687,-2.511894


In [164]:
df2 = df.copy()
df2.loc['e', 'two'] = np.nan

In [166]:
df2.cumsum()

Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,0.929249,-1.682274
e,0.119209,,-2.544123
f,-1.98536,0.43432,-1.472319
h,,-0.272451,-2.511894


In [167]:
df2.cumsum(skipna=False)

Unnamed: 0,one,two,three
a,,-0.282863,-1.509059
c,,0.929249,-1.682274
e,,,-2.544123
f,,,-1.472319
h,,,-2.511894


In [168]:
dff = pd.DataFrame(np.random.randn(10, 3), columns=list('ABC'))

Unnamed: 0,A,B,C
0,-1.511898,-0.11407,-1.436639
1,0.429865,-0.570523,-0.679251
2,0.152175,1.915974,-0.578704
3,-1.404176,-0.089221,0.039096
4,-0.627957,0.275054,0.590804
5,-0.348029,0.2923,1.266769
6,0.419644,-0.576055,-1.098867
7,0.620062,-1.594184,0.868699
8,-0.105717,-0.070429,0.832429
9,-0.185903,0.505295,-0.718369


In [169]:
dff.iloc[3:5, 0] = np.nan
dff.iloc[4:6, 1] = np.nan
dff.iloc[5:8, 2] = np.nan
z(dff)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       8 non-null      float64
 1   B       8 non-null      float64
 2   C       7 non-null      float64
dtypes: float64(3)
memory usage: 304.0 bytes
None


Unnamed: 0,A,B,C
0,-1.511898,-0.11407,-1.436639
1,0.429865,-0.570523,-0.679251
2,0.152175,1.915974,-0.578704
3,,-0.089221,0.039096
4,,,0.590804
5,-0.348029,,
6,0.419644,-0.576055,
7,0.620062,-1.594184,
8,-0.105717,-0.070429,0.832429
9,-0.185903,0.505295,-0.718369


In [170]:
dff.mean()

A   -0.066225
B   -0.074152
C   -0.278662
dtype: float64

In [171]:
pd.notna(dff)

Unnamed: 0,A,B,C
0,True,True,True
1,True,True,True
2,True,True,True
3,False,True,True
4,False,False,True
5,True,False,False
6,True,True,False
7,True,True,False
8,True,True,True
9,True,True,True


In [172]:
ts

NameError: name 'ts' is not defined

In [173]:
d = {'a': list(range(4)), 'b': list('ab..'), 'c': ['a', 'b', np.nan, 'd']}

{'a': [0, 1, 2, 3], 'b': ['a', 'b', '.', '.'], 'c': ['a', 'b', nan, 'd']}

In [174]:
df = pd.DataFrame(d)

Unnamed: 0,a,b,c
0,0,a,a
1,1,b,b
2,2,.,
3,3,.,d


In [175]:
df.replace('.', np.nan)

Unnamed: 0,a,b,c
0,0,a,a
1,1,b,b
2,2,,
3,3,,d


In [189]:
df = pd.DataFrame(np.random.randn(10, 2))
df[np.random.rand(df.shape[0]) > 0.5] = 1.5
df

Unnamed: 0,0,1
0,1.5,1.5
1,1.5,1.5
2,1.5,1.5
3,0.736893,-0.653446
4,1.5,1.5
5,1.5,1.5
6,-2.30959,2.529625
7,-0.334861,-0.787363
8,1.5,1.5
9,-0.918605,-0.732829


In [190]:
df.replace(1.5, np.nan)

Unnamed: 0,0,1
0,,
1,,
2,,
3,0.736893,-0.653446
4,,
5,,
6,-2.30959,2.529625
7,-0.334861,-0.787363
8,,
9,-0.918605,-0.732829


In [193]:
df00 = df.iloc[9, 0]

-0.9186045251488572

In [194]:
df.replace([1.5, df00], [np.nan, 'a'])

Unnamed: 0,0,1
0,,
1,,
2,,
3,0.736893,-0.653446
4,,
5,,
6,-2.30959,2.52963
7,-0.334861,-0.787363
8,,
9,a,-0.732829


In [196]:
s = pd.Series(np.random.randn(5), index=[0, 2, 4, 6, 7])
z(s)

----------------------------------------
Type: <class 'pandas.core.series.Series'>
Length: 5


0   -0.394024
2   -1.517913
4   -0.385972
6   -1.711875
7    1.401926
dtype: float64

In [197]:
s > 0

0    False
2    False
4    False
6    False
7     True
dtype: bool

In [198]:
(s > 0).dtype

dtype('bool')

In [199]:
crit = (s > 0).reindex(list(range(8)))

0    False
1      NaN
2    False
3      NaN
4    False
5      NaN
6    False
7     True
dtype: object

In [200]:
crit.dtype

dtype('O')

In [201]:
crit.fillna(False)

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
dtype: bool

In [202]:
reindexed

NameError: name 'reindexed' is not defined

In [203]:
reindexed = s.reindex(list(range(8))).fillna(0)

0   -0.394024
1    0.000000
2   -1.517913
3    0.000000
4   -0.385972
5    0.000000
6   -1.711875
7    1.401926
dtype: float64

In [204]:
reindexed[crit.fillna(False)]

7    1.401926
dtype: float64

In [205]:
reindexed[crit.fillna(True)]

1    0.000000
3    0.000000
5    0.000000
7    1.401926
dtype: float64

In [206]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [209]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})
df["B"] = df["A"].astype('category')
z(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A       4 non-null      object  
 1   B       4 non-null      category
dtypes: category(1), object(1)
memory usage: 144.0+ bytes
None


Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [210]:
labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]

['0 - 9',
 '10 - 19',
 '20 - 29',
 '30 - 39',
 '40 - 49',
 '50 - 59',
 '60 - 69',
 '70 - 79',
 '80 - 89',
 '90 - 99']

In [211]:
df = pd.DataFrame({'value': np.random.randint(0, 100, 20)})

Unnamed: 0,value
0,69
1,25
2,78
3,52
4,98
5,14
6,45
7,2
8,46
9,90


In [212]:
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
z(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   value   20 non-null     int32   
 1   group   20 non-null     category
dtypes: category(1), int32(1)
memory usage: 396.0 bytes
None


Unnamed: 0,value,group
0,69,60 - 69
1,25,20 - 29
2,78,70 - 79
3,52,50 - 59
4,98,90 - 99
5,14,10 - 19
6,45,40 - 49
7,2,0 - 9
8,46,40 - 49
9,90,90 - 99


In [213]:
raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"],
                        ordered=False)

[NaN, 'b', 'c', NaN]
Categories (3, object): ['b', 'c', 'd']

In [214]:
s = pd.Series(raw_cat)

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b', 'c', 'd']

In [215]:
df = pd.DataFrame({"A": ["a", "b", "c", "a"]})

Unnamed: 0,A
0,a
1,b
2,c
3,a


In [216]:
df["B"] = raw_cat

In [217]:
r5 = list(range(4))
r5_r = r5[::-1]
a = pd.DataFrame(r5, index=r5_r)

Unnamed: 0,0
3,0
2,1
1,2
0,3


In [221]:
a['B'] = r5


Unnamed: 0,0,B
3,0,0
2,1,1
1,2,2
0,3,3


In [220]:
a.reset_index()

Unnamed: 0,index,0,B
0,3,0,0
1,2,1,1
2,1,2,2
3,0,3,3


In [227]:
print(a, '\n', r5)

   0  B
3  0  0
2  1  1
1  2  2
0  3  3 
 [0, 1, 2, 3]


In [230]:
renamer = {a: b for a, b in zip(r5, r5_r) }

{0: 3, 1: 2, 2: 1, 3: 0}

In [240]:
# NOTE(review): `inplace=True` makes this cell non-idempotent — `renamer` swaps
# 0<->3 and 1<->2, so every re-run flips the index labels back again. The
# recorded output still shows 3, 2, 1, 0, which suggests the cell had been run
# an even number of times; assigning the result to a new name would avoid this.
a.rename(index=renamer, inplace=True)
# `d` must still be the `display` alias here; earlier cells rebind `d` to a
# DataFrame and to a dict.
d(a)

Unnamed: 0,0,B
3,0,0
2,1,1
1,2,2
0,3,3


In [242]:
text = list('abcd')

['a', 'b', 'c', 'd']

In [244]:
raw_cat = pd.Categorical(["a", "b", "c", "a"], categories=["b", "c", "d"],
                        ordered=False)

[NaN, 'b', 'c', NaN]
Categories (3, object): ['b', 'c', 'd']

In [245]:
z(raw_cat)

----------------------------------------
Type: <class 'pandas.core.arrays.categorical.Categorical'>
Length: 4


[NaN, 'b', 'c', NaN]
Categories (3, object): ['b', 'c', 'd']

In [246]:
s = pd.Series(raw_cat)

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b', 'c', 'd']

In [247]:
from pandas.api.types import CategoricalDtype

In [248]:
s = pd.Series(["a", "b", "c", "a"])

0    a
1    b
2    c
3    a
dtype: object

In [249]:
cat_type = CategoricalDtype(categories=["b", "c", "d"],
                       ordered=True)

CategoricalDtype(categories=['b', 'c', 'd'], ordered=True)

In [250]:
 s_cat = s.astype(cat_type)

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): ['b' < 'c' < 'd']

In [251]:
splitter = np.random.choice([0, 1], 5, p=[0.5, 0.5])

array([0, 0, 1, 0, 0])

In [253]:
s = pd.Series(pd.Categorical.from_codes(splitter,
                   categories=["train", "test", "hey dummy"]))

0    train
1    train
2     test
3    train
4    train
dtype: category
Categories (3, object): ['train', 'test', 'hey dummy']

In [262]:
s = pd.Series(["a", "b", "c", 10])
s_o_dtype = s.dtype
d(s, s_o_dtype)

0     a
1     b
2     c
3    10
dtype: object

dtype('O')

In [263]:
s2 = s.astype('category')

0     a
1     b
2     c
3    10
dtype: category
Categories (4, object): [10, 'a', 'b', 'c']

In [264]:
s3 = s2.astype(s_o_dtype)

0     a
1     b
2     c
3    10
dtype: object

In [267]:
s4 = pd.to_numeric(s3, errors='coerce')

0     NaN
1     NaN
2     NaN
3    10.0
dtype: float64

In [268]:
cdt = CategoricalDtype()

CategoricalDtype(categories=None, ordered=False)

In [270]:
cat = pd.Categorical(["a", "c", "c", np.nan], categories=["b", "a", "c"])
df = pd.DataFrame({"cat": cat, "s": ["a", "c", "c", np.nan]})
z(df)
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   cat     3 non-null      category
 1   s       3 non-null      object  
dtypes: category(1), object(1)
memory usage: 144.0+ bytes
None


Unnamed: 0,cat,s
0,a,a
1,c,c
2,c,c
3,,


Unnamed: 0,cat,s
count,3,3
unique,2,2
top,c,c
freq,2,2


In [271]:
df["cat"].describe()

count     3
unique    2
top       c
freq      2
Name: cat, dtype: object

In [272]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [273]:
 s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [274]:
s.cat.ordered

False

In [275]:
# Categories are given explicitly in the order 'c', 'b', 'a'.
# (Pasted IPython continuation prompts removed so the cell is plain Python.)
s = pd.Series(pd.Categorical(["a", "b", "c", "a"],
                             categories=["c", "b", "a"]))

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['c', 'b', 'a']

In [276]:
s.cat.categories

Index(['c', 'b', 'a'], dtype='object')

In [277]:
s.cat.ordered

False

In [278]:
s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd')))

0    b
1    a
2    b
3    c
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [279]:
# categories
s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [280]:
# uniques
s.unique()

['b', 'a', 'c']
Categories (3, object): ['b', 'a', 'c']

In [281]:
s = pd.Series(["a", "b", "c", "a"], dtype="category")

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [282]:
c = s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [286]:
g = [f'Group {i.upper()}' for i in c]

['Group A', 'Group B', 'Group C']

In [287]:
# Assigning to `.cat.categories` is no longer supported in pandas 2.x;
# `rename_categories` returns a new Series with the relabelled categories.
# The trailing `;` keeps this cell silent, as the original assignment was.
s = s.cat.rename_categories(g);

In [288]:
s

0    Group A
1    Group B
2    Group C
3    Group A
dtype: category
Categories (3, object): ['Group A', 'Group B', 'Group C']

In [294]:
s2 = s.cat.rename_categories(['x', 'y', 'z'])

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): ['x', 'y', 'z']

In [298]:
s3 = s2.cat.remove_categories('z')
d(s2, s3)

0    x
1    y
2    z
3    x
dtype: category
Categories (3, object): ['x', 'y', 'z']

0      x
1      y
2    NaN
3      x
dtype: category
Categories (2, object): ['x', 'y']

In [299]:
# Column 'A' is an ordered categorical with the explicit order e < a < b.
# (Pasted IPython continuation prompts removed so the cell is plain Python.)
dfs = pd.DataFrame({'A': pd.Categorical(list('bbeebbaa'),
                                        categories=['e', 'a', 'b'],
                                        ordered=True),
                    'B': [1, 2, 1, 2, 2, 1, 2, 1]})

Unnamed: 0,A,B
0,b,1
1,b,2
2,e,1
3,e,2
4,b,2
5,b,1
6,a,2
7,a,1


In [300]:
z(dfs)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   A       8 non-null      category
 1   B       8 non-null      int64   
dtypes: category(1), int64(1)
memory usage: 196.0 bytes
None


Unnamed: 0,A,B
0,b,1
1,b,2
2,e,1
3,e,2
4,b,2
5,b,1
6,a,2
7,a,1


In [302]:
dfs['A'].cat.categories

Index(['e', 'a', 'b'], dtype='object')

In [303]:
# Ordered categorical whose category order is 3 < 2 < 1.
# (Pasted IPython continuation prompts removed so the cell is plain Python.)
cat = pd.Series([1, 2, 3]).astype(
    CategoricalDtype([3, 2, 1], ordered=True)
)

0    1
1    2
2    3
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [304]:
# Same ordered dtype (3 < 2 < 1) as `cat`, so the two can be compared.
# (Pasted IPython continuation prompts removed so the cell is plain Python.)
cat_base = pd.Series([2, 2, 2]).astype(
    CategoricalDtype([3, 2, 1], ordered=True)
)

0    2
1    2
2    2
dtype: category
Categories (3, int64): [3 < 2 < 1]

In [306]:
# No categories are given, so they are inferred from the data (only 2 here).
# (Pasted IPython continuation prompts removed so the cell is plain Python.)
cat_base2 = pd.Series([2, 2, 2]).astype(
    CategoricalDtype(ordered=True))

0    2
1    2
2    2
dtype: category
Categories (1, int64): [2]

In [307]:
cat > cat_base

0     True
1    False
2    False
dtype: bool

In [308]:
cat == cat_base

0    False
1     True
2    False
dtype: bool

In [309]:
c1 = pd.Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)

['a', 'b']
Categories (2, object): ['a', 'b']

In [310]:
c2 = pd.Categorical(['a', 'b'], categories=['b', 'a'], ordered=False)

['a', 'b']
Categories (2, object): ['b', 'a']

In [311]:
type(c2)

pandas.core.arrays.categorical.Categorical

In [312]:
idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"])

Index(['h', 'i', 'j', 'k', 'l', 'm', 'n'], dtype='object')

In [313]:
# Categorical Series labelled by `idx` (defined in the previous cell).
# (Pasted IPython continuation prompts removed so the cell is plain Python.)
cats = pd.Series(["a", "b", "b", "b", "c", "c", "c"],
                 dtype="category", index=idx)

h    a
i    b
j    b
k    b
l    c
m    c
n    c
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [314]:
values = [1, 2, 2, 2, 3, 4, 5]

[1, 2, 2, 2, 3, 4, 5]

In [315]:
df = pd.DataFrame({"cats": cats, "values": values}, index=idx)

Unnamed: 0,cats,values
h,a,1
i,b,2
j,b,2
k,b,2
l,c,3
m,c,4
n,c,5


In [316]:
df.iloc[2:4, :]

Unnamed: 0,cats,values
j,b,2
k,b,2


In [317]:
df.iloc[2:4, :].dtypes

cats      category
values       int64
dtype: object

In [318]:
df.loc["h":"j", "cats"]

h    a
i    b
j    b
Name: cats, dtype: category
Categories (3, object): ['a', 'b', 'c']

In [323]:
df.iloc[2:4, :].dtypes['cats'].categories
df.iloc[2:4, :].dtypes['cats'].ordered

False

In [324]:
df["cats"] == "b"

h    False
i     True
j     True
k     True
l    False
m    False
n    False
Name: cats, dtype: bool

In [None]:
 # Select the rows whose category is "b", using the boolean mask built in the
 # previous cell (the original `df[]` was an unfinished, never-executed cell).
 df[df["cats"] == "b"]

In [325]:
s = pd.Series(["a", "b", np.nan, "a"], dtype="category")

0      a
1      b
2    NaN
3      a
dtype: category
Categories (2, object): ['a', 'b']

In [327]:
s.cat.categories

Index(['a', 'b'], dtype='object')

In [328]:
s.cat.codes

0    0
1    1
2   -1
3    0
dtype: int8