In [2]:
import pandas as pd
import numpy as np

# Working with text data

## Dedicated data type - StringDtype

In [43]:
# No explicit dtype is requested here; the dtype recorded below is what
# pandas picked on its own for a list of Python strings.
s_text = pd.Series(
    data=['Hey', 'Hi', 'Hello', 'Farewell'],
)
s_text.dtype

dtype('O')

In [44]:
# Requesting the builtin `str` type as dtype; compare the recorded dtype
# below with the dedicated string dtype used in the following cells.
s_text = pd.Series(
    data=['Hey', 'Hi', 'Hello', 'Farewell'],
    dtype=str,
)
s_text.dtype

dtype('O')

In [45]:
# Opt in to the dedicated string dtype by passing a StringDtype instance.
s_text = pd.Series(
    data=['Hey', 'Hi', 'Hello', 'Farewell'],
    dtype=pd.StringDtype(),
)
s_text.dtype

StringDtype

In [46]:
# The 'string' alias selects the same dedicated string dtype as
# pd.StringDtype() (see the identical dtype output below).
s_text = pd.Series(
    data=['Hey', 'Hi', 'Hello', 'Farewell'],
    dtype='string',
)
s_text.dtype

StringDtype

### Accessing string methods on a Series through the `.str` accessor

In [64]:
# Chain several vectorised string operations through the `.str` accessor.
# Parentheses keep the chain on multiple lines without backslash continuations.
(
    s_text
    .str.upper()                       # upper-case every element
    .str.replace('E', '3')             # swap each 'E' for '3'
    .str.split('', expand=True, n=3)   # split on the empty pattern, at most 3 splits, one column per piece
)

Unnamed: 0,0,1,2,3
0,,H,3,Y
1,,H,I,
2,,H,3,LLO
3,,F,A,R3W3LL


In [77]:
# Called without another series, `str.cat` joins every element of the
# Series into one single string, using `sep` between the elements.
s_text.str.cat(
    sep='__',
)

'Hey__Hi__Hello__Farewell'

#### Replacing NA values

In [81]:
# Two different "missing" markers (np.nan and None) go in; with the 'string'
# dtype both are displayed as <NA> in the output below.
s_text_with_na = pd.Series(
    data=['Hi', np.nan, None, 'Hello'],
    dtype='string',
)
s_text_with_na

0       Hi
1     <NA>
2     <NA>
3    Hello
dtype: string

In [97]:
# `na_rep` supplies the text used in place of missing entries while joining
# the whole Series into one string.
s_text_with_na.str.cat(
    sep=' ',
    na_rep='<missing_token>',
)

'Hi <missing_token> <missing_token> Hello'

In [98]:
# A second string series ('1'..'4') to concatenate element-wise.
other_text_series = pd.Series(
    data=[str(number) for number in (1, 2, 3, 4)],
    dtype=pd.StringDtype(),
)
# Without `na_rep`, rows where the left-hand value is missing stay <NA>
# (see the output below).
s_text_with_na.str.cat(
    other_text_series,
    sep='_&_',
)

0       Hi_&_1
1         <NA>
2         <NA>
3    Hello_&_4
dtype: string

In [99]:
# Element-wise concatenation again, this time replacing missing values with
# a placeholder so that no row of the result is <NA>.
s_text_with_na.str.cat(
    other_text_series,
    sep='_&_',
    na_rep='<missing_token>',
)

0                 Hi_&_1
1    <missing_token>_&_2
2    <missing_token>_&_3
3              Hello_&_4
dtype: string

In [104]:
# Operator-based alternative: fill the missing values first, then build the
# combined strings with `+`.
(
    s_text_with_na.fillna('<missing_token>')
    + '_&_'
    + other_text_series
)

0                 Hi_&_1
1    <missing_token>_&_2
2    <missing_token>_&_3
3              Hello_&_4
dtype: string

### Benchmark: concatenating StringDtype series (`+` operator vs `str.cat`)

In [121]:
# Benchmark fixtures: two StringDtype series with int(1e7) elements each,
# holding the string form of evenly spaced floats -- ascending from 1 to 1e5
# in the first series, descending from 1e5 to 1 in the second.
# NOTE: building these is expensive in both time and memory.
s_text_big_1 = pd.Series(map(str, np.linspace(1, 1e5, int(1e7))), dtype=pd.StringDtype())
s_text_big_2 = pd.Series(map(str, np.linspace(1e5, 1, int(1e7))), dtype=pd.StringDtype())

In [123]:
%%time
# Variant 1: concatenate the two big series with the `+` operator.
# The loop repeats the operation 10 times; each result is discarded, so only
# the total time for all repetitions is reported by %%time.
for i in range(10):
    s_text_big_1 + '_' + s_text_big_2

CPU times: user 45.6 s, sys: 7.17 s, total: 52.7 s
Wall time: 52.8 s


In [124]:
%%time
# Variant 2: the same concatenation of the two big series via `str.cat`.
# Same number of repetitions (10) as the `+` variant so the totals reported
# by %%time can be compared directly; results are discarded.
for _ in range(10):
    s_text_big_1.str.cat(s_text_big_2, sep='_')

CPU times: user 39.8 s, sys: 7.88 s, total: 47.7 s
Wall time: 49.5 s


Repeat the test on smaller series (10,000 elements, 1,000 repetitions).

In [125]:
# Smaller benchmark fixtures: same construction as the big series, but with
# int(1e4) elements each (ascending 1 -> 1e5 and descending 1e5 -> 1).
s_text_small_1 = pd.Series(map(str, np.linspace(1, 1e5, int(1e4))), dtype=pd.StringDtype())
s_text_small_2 = pd.Series(map(str, np.linspace(1e5, 1, int(1e4))), dtype=pd.StringDtype())

In [129]:
%%time
# Variant 1 on the small series: `+` operator, repeated 1000 times.
# Results are discarded; %%time reports the total for all repetitions.
for i in range(1000):
    s_text_small_1 + '_' + s_text_small_2

CPU times: user 3.54 s, sys: 14.6 ms, total: 3.56 s
Wall time: 3.56 s


In [130]:
%%time
# Variant 2 on the small series: `str.cat`, repeated 1000 times to match the
# `+` variant. Results are discarded; %%time reports the total.
for _ in range(1000):
    s_text_small_1.str.cat(s_text_small_2, sep='_')

CPU times: user 3.23 s, sys: 18.7 ms, total: 3.25 s
Wall time: 3.26 s


#### Using standard Python indexing/slicing

In [143]:
# Python slice syntax on the `.str` accessor is applied to each element:
# every string is reduced to its last four characters, and strings shorter
# than that are kept whole (see 'Hey' and 'Hi' in the output below).
(
    s_text
    .str[-4:]
)

0     Hey
1      Hi
2    ello
3    well
dtype: string

### Indicator/dummy variables

In [162]:
# Lower-case first so that 'H' and 'h' are not treated as different values,
# then build indicator columns using the empty separator; per the output
# below this gives one 0/1 column for each distinct character.
(
    s_text
    .str.lower()
    .str.get_dummies(sep='')
)

Unnamed: 0,a,e,f,h,i,l,o,r,w,y
0,0,1,0,1,0,0,0,0,0,1
1,0,0,0,1,1,0,0,0,0,0
2,0,1,0,1,0,1,1,0,0,0
3,1,1,1,0,0,1,0,1,1,0


### Miscellaneous

In [175]:
# Pad on the right up to a width of 10, then measure the resulting lengths;
# every length reported below is 10.
(
    s_text
    .str.pad(width=10, side='right')
    .str.len()
)

0    10
1    10
2    10
3    10
dtype: Int64