# Series string methods
If you are working with a text dataset, it is quite common to need to clean it, split the texts, or extract some information about the samples from their text. To do this we can take advantage of the vectorized string methods that pandas exposes through the `.str` accessor of a Series.

In [3]:
import numpy as np
import pandas as pd
# Show at most 8 rows when displaying a Series/DataFrame, just to keep outputs short.
# The fully qualified option name is used on purpose: the bare 'max_rows' pattern
# is ambiguous on pandas versions that also define 'styler.render.max_rows',
# and set_option then raises OptionError.
pd.set_option('display.max_rows', 8)

In [4]:
# Load the movie dataset from the local CSV file
movie = pd.read_csv('data/movie.csv')
# Keep only the director names, as a single Series
directors = movie.loc[:, 'director_name']

In [15]:
# Length (number of characters) of each director name, computed element-wise.
# Missing names stay NaN, which is why the result is float64 rather than an integer dtype.
directors.str.len()

0       13.0
1       14.0
2       10.0
3       17.0
        ... 
4912     NaN
4913    16.0
4914    11.0
4915     8.0
Name: director_name, Length: 4916, dtype: float64

In [16]:
# Element-wise prefix / suffix tests on each director name.
# Missing names come back as NaN instead of True/False, so the result has
# object dtype rather than bool.
# NOTE(review): passing na=False should yield a plain boolean mask — confirm against the pandas docs.
is_starts_with_a = directors.str.startswith('A')
is_ends_with_z = directors.str.endswith('z')
# Display the "starts with 'A'" result
is_starts_with_a

0       False
1       False
2       False
3       False
        ...  
4912      NaN
4913    False
4914    False
4915    False
Name: director_name, Length: 4916, dtype: object

In [14]:
# Reduce the mask to two flags: does at least one name start with 'A',
# and do all of them start with 'A'?
is_starts_with_a.any(), is_starts_with_a.all()

(True, False)

In [34]:
# Split each name on single spaces: every non-missing entry becomes a list
# of its parts, while missing names stay NaN.
directors.str.split(' ')

0           [James, Cameron]
1          [Gore, Verbinski]
2              [Sam, Mendes]
3       [Christopher, Nolan]
                ...         
4912                     NaN
4913     [Benjamin, Roberds]
4914          [Daniel, Hsia]
4915             [Jon, Gunn]
Name: director_name, Length: 4916, dtype: object

In [5]:
# Extract the first word of each name (the given name). With expand=True the
# result is a one-column DataFrame whose column is named after the regex group.
name = directors.str.extract(r'(?P<name>\w+)', expand=True)
# Extract the word that follows the first space. The space is kept OUTSIDE the
# capture group so the extracted surname does not carry a leading blank.
# With expand=False the result is a Series named after the group ('surname').
# Caveat: for names with more than two words (e.g. "Guillermo del Toro") this
# returns the second word, not the last one.
family = directors.str.extract(r' (?P<surname>\w+)', expand=False)
print(family)
name

0          Cameron
1        Verbinski
2           Mendes
3            Nolan
           ...    
4912           NaN
4913       Roberds
4914          Hsia
4915          Gunn
Name: surname, Length: 4916, dtype: object


Unnamed: 0,name
0,James
1,Gore
2,Sam
3,Christopher
...,...
4912,
4913,Benjamin
4914,Daniel
4915,Jon


In [46]:
# Count how many times 'e' occurs in each name; missing names stay NaN,
# so the counts are returned as float64.
directors.str.count('e')

0       2.0
1       2.0
2       2.0
3       1.0
       ... 
4912    NaN
4913    2.0
4914    1.0
4915    0.0
Name: director_name, Length: 4916, dtype: float64

In [70]:
# Join all names of the Series into one single '-'-separated string,
# then keep only its first 100 characters for display.
directors.str.cat(sep='-')[:100]

"James Cameron-Gore Verbinski-Sam Mendes-Christopher Nolan-Doug Walker-Andrew Stanton-Sam Raimi-Nathan Greno-Joss Whedon-David Yates-Zack Snyder-Bryan Singer-Marc Forster-Gore Verbinski-Gore Verbinski-Zack Snyder-Andrew Adamson-Joss Whedon-Rob Marshall-Barry Sonnenfeld-Peter Jackson-Marc Webb-Ridley Scott-Peter Jackson-Chris Weitz-Peter Jackson-James Cameron-Anthony Russo-Peter Berg-Colin Trevorrow-Sam Mendes-Sam Raimi-Shane Black-Tim Burton-Brett Ratner-Dan Scanlon-Michael Bay-Michael Bay-Sam Raimi-Marc Webb-Joseph Kosinski-John Lasseter-Martin Campbell-Lee Unkrich-McG-James Wan-Marc Forster-Bryan Singer-J.J. Abrams-Bryan Singer-Baz Luhrmann-Mike Newell-Guillermo del Toro-Michael Bay-Steven Spielberg-Peter Sohn-Mark Andrews-Justin Lin-Andrew Stanton-Brett Ratner-Roland Emmerich-Robert Zemeckis-Lana Wachowski-David Yates-Andrew Adamson-Bryan Singer-Christopher Nolan-Pete Docter-Rob Letterman-Jon Favreau-Martin Scorsese-Barry Sonnenfeld-Rob Cohen-David Ayer-Tom Shadyac-Doug Liman-Kevin 

In [69]:
# Element-wise concatenation of two Series: each surname is joined with the
# full name from the same row, separated by ', '.
# Rows where a value is missing come out as NaN.
family.str.cat(directors, sep=', ')

0           Cameron, James Cameron
1        Verbinski, Gore Verbinski
2               Mendes, Sam Mendes
3         Nolan, Christopher Nolan
                   ...            
4912                           NaN
4913     Roberds, Benjamin Roberds
4914             Hsia, Daniel Hsia
4915                Gunn, Jon Gunn
Name: surname, Length: 4916, dtype: object

In [50]:
# Public (non-underscore) attributes exposed by the .str accessor
[attr for attr in dir(pd.Series.str) if not attr.startswith('_')]

['capitalize',
 'cat',
 'center',
 'contains',
 'count',
 'decode',
 'encode',
 'endswith',
 'extract',
 'extractall',
 'find',
 'findall',
 'get',
 'get_dummies',
 'index',
 'isalnum',
 'isalpha',
 'isdecimal',
 'isdigit',
 'islower',
 'isnumeric',
 'isspace',
 'istitle',
 'isupper',
 'join',
 'len',
 'ljust',
 'lower',
 'lstrip',
 'match',
 'normalize',
 'pad',
 'partition',
 'repeat',
 'replace',
 'rfind',
 'rindex',
 'rjust',
 'rpartition',
 'rsplit',
 'rstrip',
 'slice',
 'slice_replace',
 'split',
 'startswith',
 'strip',
 'swapcase',
 'title',
 'translate',
 'upper',
 'wrap',
 'zfill']

## Timings

In [37]:
%%timeit 
# Baseline: time the vectorized pandas string method
directors.str.count('e')

3.93 ms ± 139 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
%%timeit
es = []
for record in directors:
    try:
        es.append(record.count('e'))
    except:
        es.append(np.nan)

1.5 ms ± 26 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [39]:
# Check that the pure-Python counting and the vectorized method agree.
# The list is rebuilt here because names assigned inside a %%timeit cell are
# not kept in the notebook namespace, so `es` would be undefined on a fresh
# Restart & Run All.
es = [record.count('e') if isinstance(record, str) else np.nan for record in directors]
assert pd.Series(es).equals(directors.str.count('e')), 'Series are not equal'

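Surprisingly, the plain Python loop (about 1.5 ms) is faster here than the vectorized pandas method (about 3.9 ms). This is likely because `.str` methods on object-dtype columns still process the strings one at a time under the hood, so their main benefit is convenience (concise code, built-in handling of missing values) rather than raw speed.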