# STRING METHODS

In [63]:
import pandas as pd
import numpy as np

### String methods in pandas

- capitalize
- casefold
- cat
- center
- count
- decode
- encode
- endswith
- extract
- extractall
- find
- findall
- fullmatch
- get
- get_dummies
- index
- isalnum
- isalpha
- isdecimal
- isdigit
- islower
- isnumeric
- isspace
- istitle
- isupper
- translate
- join
- len
- ljust
- lstrip
- lower
- match
- normalize
- pad
- partition
- repeat
- replace
- rfind
- rindex
- rjust
- rpartition
- rsplit
- rstrip
- slice
- slice_replace
- split
- startswith
- strip
- swapcase
- title
- upper
- wrap
- zfill

In [4]:
# Sample Series of lowercase city names reused by the examples below.
ser = pd.Series(
    [
        'washington',
        'brooklyn',
        'omaha',
        'pittsburgh',
    ]
)
ser

0    washington
1      brooklyn
2         omaha
3    pittsburgh
dtype: object

##### Change the first letter in each word to uppercase

In [None]:
# Title-case: uppercase the first letter of every word in each string.
# For the sample data all four methods give the same values; only the last
# expression of the cell is displayed.

# method one: vectorised pandas string accessor
ser.str.title()

# method 2: apply Python's str.title to each element
ser.map(lambda x: x.title())

# method 3: capitalize each space-separated word by hand
# (splitting first also keeps empty strings from raising IndexError)
ser.map(lambda x: ' '.join(word.capitalize() for word in x.split(' ')))

# method 4: list comprehension wrapped back into a Series
pd.Series([i.title() for i in ser])

##### Capitalize

In [50]:
# Capitalize: uppercase only the first character of each string and
# lowercase the rest. Only the last expression of the cell is displayed.

# method one: vectorised pandas string accessor
ser.str.capitalize()

# method 2: str.capitalize per element (str.title would instead uppercase
# the first letter of every word)
ser.map(lambda x: x.capitalize())

# method 3: slicing; x[:1] keeps empty strings from raising IndexError
ser.map(lambda x: x[:1].upper() + x[1:].lower())

# method 4: list comprehension wrapped back into a Series
pd.Series([i.capitalize() for i in ser])

0    Washington
1      Brooklyn
2         Omaha
3    Pittsburgh
dtype: object

##### Casefold

In [6]:
ser.str.casefold()

0    washington
1      brooklyn
2         omaha
3    pittsburgh
dtype: object

##### Cat

In [11]:
ser.str.cat(sep=' - ')

'washington - brooklyn - omaha - pittsburgh'

In [12]:
# `new_ser` has only three elements (index 0-2) while `ser` has four, so the
# last row of `ser` has no partner and the concatenated result there is NaN.
new_ser = pd.Series(['london', 'barcelona', 'rome'])
ser.str.cat(others=new_ser, sep=' - ')

0     washington - london
1    brooklyn - barcelona
2            omaha - rome
3                     NaN
dtype: object

##### Center

In [84]:
ser.str.center(12, fillchar='-')


0    -washington-
1    --brooklyn--
2    ---omaha----
3    -pittsburgh-
dtype: object

##### Count
Count occurrences of pattern in each string of the Series/Index.

In [25]:
ser.str.count(pat='a')

0    1
1    0
2    2
3    0
dtype: int64

In [28]:
ser[ser.str.count(pat='a') >= 2]

2    omaha
dtype: object

##### Endswith

In [26]:
ser.str.endswith('n')

0     True
1     True
2    False
3    False
dtype: bool

In [27]:
ser[ser.str.endswith('n')]

0    washington
1      brooklyn
dtype: object

##### Extract
- Extract capture groups in the regex pat as columns in a DataFrame.
- For each subject string in the Series, extract groups from the first match of regular expression pat.

In [35]:
# Two capture groups -> two result columns: the letter and the digit.
# 'c3' starts with neither 'a' nor 'b', so its row is entirely missing values.
s = pd.Series(['a1', 'b2', 'c3'])
s.str.extract(r'([ab])(\d)')

Unnamed: 0,0,1
0,a,1.0
1,b,2.0
2,,


##### Extractall
- Extract capture groups in the regex pat as columns in a DataFrame.
- For each subject string in the Series, extract groups from all matches of regular expression pat. When each subject string in the Series has exactly one match, `extractall(pat).xs(0, level='match')` is the same as `extract(pat)`.


In [36]:
# One result row per match: "a1a2" matches twice (match 0 and match 1),
# "b1" once, and "c1" not at all, so label "C" is absent from the result.
s = pd.Series(["a1a2", "b1", "c1"], index=["A", "B", "C"])
s.str.extractall(r"[ab](\d)")

Unnamed: 0_level_0,Unnamed: 1_level_0,0
Unnamed: 0_level_1,match,Unnamed: 2_level_1
A,0,1
A,1,2
B,0,1


##### Find
- Return the lowest index in each string in the Series/Index at which the substring is found.
- Each returned index corresponds to the position where the substring is fully contained between `[start:end]`. Returns `-1` on failure.

In [38]:
ser

0    washington
1      brooklyn
2         omaha
3    pittsburgh
dtype: object

In [37]:
ser.str.find('h')

0    3
1   -1
2    3
3    9
dtype: int64

##### Findall

In [47]:
import re
# The pattern is capitalised while the data is lowercase; re.IGNORECASE makes
# the search case-insensitive, so 'washington' is still found.
ser.str.findall('Washington', flags=re.IGNORECASE)

0    [washington]
1              []
2              []
3              []
dtype: object

##### Fullmatch
- Determine if each string entirely matches a regular expression.

In [48]:
ser.str.fullmatch('omaha')

0    False
1    False
2     True
3    False
dtype: bool

##### Get
- Extract element from each component at specified position.
- Extract element from lists, tuples, or strings in each element in the Series/Index.

In [51]:
ser.str.get(-1)

0    n
1    n
2    a
3    h
dtype: object

In [57]:
ser

0    washington
1      brooklyn
2         omaha
3    pittsburgh
dtype: object

##### Index
- Return the lowest index in each string in the Series/Index at which the substring is found.
- Unlike `find`, raises a `ValueError` when the substring is not found.

In [62]:
s = pd.Series(['abc', 'dab', 'ffabtr'])
s.str.index('ab')

0    0
1    1
2    2
dtype: int64

##### Join
- Join lists contained as elements in the Series/Index with the passed delimiter.
- The example below joins all lists using a `-`. Lists containing objects of types other than str produce NaN.

In [65]:
# Only the first list consists entirely of strings; each of the others holds
# at least one non-string element (floats, NaN, a nested list), so joining
# it yields NaN.
s = pd.Series(
    [
        ['lion', 'elephant', 'zebra'],
        [1.1, 2.2, 3.3],
        ['cat', np.nan, 'dog'],
        ['cow', 4.5, 'goat'],
        ['duck', ['swan', 'fish'], 'guppy']
    ])

s.str.join(sep='-')

0    lion-elephant-zebra
1                    NaN
2                    NaN
3                    NaN
4                    NaN
dtype: object

##### Len
- Compute the length of each element in the Series/Index.
- The element may be a sequence (such as a string, tuple or list) or a collection (such as a dictionary).

In [67]:
# Mixed element types: strings, a dict, a list and a tuple all have a length,
# but the int 5 does not and gives NaN, so the result dtype is float64.
s = pd.Series(['dog', '', 5, {'foo' : 'bar'}, [2, 3, 5, 7], ('one', 'two', 'three')])

s.str.len()

0    3.0
1    0.0
2    NaN
3    1.0
4    4.0
5    3.0
dtype: float64

##### Ljust
- Pad right side of strings in the Series/Index.

In [82]:
ser.str.ljust(10, fillchar='-')

0    washington
1    brooklyn--
2    omaha-----
3    pittsburgh
dtype: object

##### Lower
- Convert strings in the Series/Index to lowercase.

In [85]:
ser.str.lower()

0    washington
1      brooklyn
2         omaha
3    pittsburgh
dtype: object

##### Lstrip
- Remove leading characters from each string in the Series/Index.

In [93]:
# Strip leading characters that belong to the set {'.', '1', '2', '3'}.
# NOTE: the space after the dot is not in that set, so stripping stops there
# and each result keeps one leading space; pass '.123 ' to remove it as well.
s = pd.Series(['1. London', '2. Barcelona', '3. Madrid'])
s.str.lstrip('.123')

0        London
1     Barcelona
2        Madrid
dtype: object

##### Match
- Determine if each string starts with a match of a regular expression.

In [95]:
ser.str.match('om')

0    False
1    False
2     True
3    False
dtype: bool

##### Normalize
- Return the Unicode normal form for the strings in the Series/Index.

##### Pad

In [99]:
ser.str.pad(width=12, side='left', fillchar='-')

0    --washington
1    ----brooklyn
2    -------omaha
3    --pittsburgh
dtype: object

In [98]:
ser.str.pad(width=12, side='both', fillchar='-')

0    -washington-
1    --brooklyn--
2    ---omaha----
3    -pittsburgh-
dtype: object

##### Partition
- Split the string at the first occurrence of sep.


In [100]:
# No separator is passed, so each name is split at its first space into
# three parts: the text before, the separator itself, and the text after.
s = pd.Series(['Linda van der Berg', 'George Pitt-Rivers'])
s.str.partition()

Unnamed: 0,0,1,2
0,Linda,,van der Berg
1,George,,Pitt-Rivers


In [102]:
s.str.partition('-', expand=False)

0    (Linda van der Berg, , )
1    (George Pitt, -, Rivers)
dtype: object

In [103]:
s.str.partition('-', expand=True)

Unnamed: 0,0,1,2
0,Linda van der Berg,,
1,George Pitt,-,Rivers


##### Rstrip
- Remove trailing characters from each string in the Series/Index.

In [91]:
# Strip trailing punctuation and whitespace ('.', '!', '?', space, newline,
# tab) from each string; the missing value stays NaN.
s = pd.Series(['1. Ant.  ', '2. Bee!\n', '3. Cat?\t', np.nan])
s.str.rstrip(to_strip='.!? \n\t')
              

0    1. Ant
1    2. Bee
2    3. Cat
3       NaN
dtype: object