In [1]:
import pandas as pd

#### Pre-processing a dataset often involves cleaning up string columns. In this example, one column contains unwanted characters that we need to remove.

In [19]:
# Sample frame built column by column: colA counts 1-10, colB holds the
# strings to clean (optional +/- sign, a digit, then a letter), and colC
# holds integer values.
sample_columns = {
    'colA': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'colB': ['+9A', '-1A', '5B', '+1D', '+1C', '-2E', '+3T', '5A', '+5B', '-1A'],
    'colC': [100, 121, 312, 567, 123, 101, 231, 769, 907, 15],
}
df = pd.DataFrame(sample_columns)

#### In `colB` we need to remove the sign prefixes (`+` / `-`) and the letter suffixes.

#### The `pandas.Series.str.replace()` method replaces each occurrence of a pattern in the Series / Index. Since pandas 2.0 the pattern is treated as a literal string unless `regex=True` is passed.

#### We can specify a regex that replaces every non-digit character with an empty string.

In [6]:
# Method 1: strip every non-digit character and store the result back in colB.
# regex=True is required: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, so r'\D' would match nothing.
df['colB'] = df['colB'].str.replace(r'\D', '', regex=True)

# Method 2: the same idea spelled as a negated character class.
# The result is not assigned; as the cell's last expression it is displayed.
df['colB'].str.replace(r'[^0-9]', '', regex=True)

#### Another way is to use the `str.extract()` method

In [10]:
# Capture the first run of consecutive digits in each colB value.
# expand=False makes str.extract return a Series instead of a DataFrame.
first_digit_run = df['colB'].str.extract(r'(\d+)', expand=False)
df['colB'] = first_digit_run
df

Unnamed: 0,colA,colB,colC
0,1,9,100
1,2,1,121
2,3,5,312
3,4,1,567
4,5,1,123
5,6,2,101
6,7,3,231
7,8,5,769
8,9,5,907
9,10,1,15


#### `Series.replace()` with `regex=True` is another alternative

In [13]:
# Series.replace in regex mode: every non-digit character (\D) in each
# value is substituted with the empty string.
digits_only = df['colB'].replace(to_replace=r'\D', value=r'', regex=True)
df['colB'] = digits_only
df

Unnamed: 0,colA,colB,colC
0,1,9,100
1,2,1,121
2,3,5,312
3,4,1,567
4,5,1,123
5,6,2,101
6,7,3,231
7,8,5,769
8,9,5,907
9,10,1,15


#### We can use `map()` to apply a function that strips `+`/`-` from the beginning of each string and any ASCII letters from the end.

In [15]:
from string import ascii_letters


def _strip_sign_and_letters(value):
    """Drop leading '+'/'-' characters and trailing ASCII letters from ``value``."""
    without_sign = value.lstrip('+-')
    return without_sign.rstrip(ascii_letters)


df['colB'] = df['colB'].map(_strip_sign_and_letters)
df

Unnamed: 0,colA,colB,colC
0,1,9,100
1,2,1,121
2,3,5,312
3,4,1,567
4,5,1,123
5,6,2,101
6,7,3,231
7,8,5,769
8,9,5,907
9,10,1,15


#### Instead of `str.replace()` we could use a list comprehension with `re.sub()`

In [18]:
import re

# Compile the "anything that is not 0-9" pattern once, then delete every
# match from each colB string in turn.
non_digit = re.compile('[^0-9]')
df['colB'] = [non_digit.sub('', value) for value in df['colB']]
df

Unnamed: 0,colA,colB,colC
0,1,9,100
1,2,1,121
2,3,5,312
3,4,1,567
4,5,1,123
5,6,2,101
6,7,3,231
7,8,5,769
8,9,5,907
9,10,1,15


#### The `str.extract()` approach can also be expressed as a list comprehension using `re.search()`

In [20]:
import re


def _first_digit_run(text):
    """Return the first run of consecutive digits in ``text``, or None if there is none."""
    match = re.search(r'\d+', text)
    # re.search returns None when nothing matches; indexing None would raise TypeError.
    return match[0] if match else None


# r'\d+' rather than a single '[0-9]' so multi-digit values such as '12' are
# kept whole, mirroring the str.extract(r'(\d+)') approach.
df['colB'] = [_first_digit_run(x) for x in df['colB']]
df

Unnamed: 0,colA,colB,colC
0,1,9,100
1,2,1,121
2,3,5,312
3,4,1,567
4,5,1,123
5,6,2,101
6,7,3,231
7,8,5,769
8,9,5,907
9,10,1,15
