<a href="https://colab.research.google.com/github/Nidhi89717/ML/blob/main/13_Text_Methods.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Methods

## A normal Python string has a variety of method calls available:

In [None]:
import numpy as np
import pandas as pd

In [None]:
email = 'nid@email.com'

In [None]:
email.split('@')

['nid', 'email.com']

In [None]:
email.isdigit()

False

## Text Methods on Pandas String Column

In [None]:
names = pd.Series(['andrew', 'bobo', 'claire','david', '5'])

In [None]:
names

0    andrew
1      bobo
2    claire
3     david
4         5
dtype: object

In [None]:
names.str.capitalize()

0    Andrew
1      Bobo
2    Claire
3     David
4         5
dtype: object

In [None]:
names.str.isdigit()

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [None]:
names.str.upper()

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

## Splitting, Grabbing, and Expanding

In [None]:
tech_finance = ['GOOG,APPL,AMZN', 'JPM,BAC,GS']

In [None]:
len(tech_finance)

2

In [None]:
tickers = pd.Series(tech_finance)

In [None]:
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [None]:
tickers.str.split(',')

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [None]:
# .str[0] indexes into each row's list of split pieces, keeping only the first ticker.
tickers.str.split(',').str[0]

0    GOOG
1     JPM
dtype: object

In [None]:
# expand=True spreads the split pieces across columns instead of returning one list per row.
tickers.str.split(',',expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


## Cleaning or Editing Strings

In [None]:
messy_names = pd.Series(['andrew  ','bo;bo',"  claire  "])

In [None]:
messy_names

0      andrew  
1         bo;bo
2      claire  
dtype: object

In [None]:
messy_names[0]

'andrew  '

In [None]:
messy_names.str.replace(';','')

0      andrew  
1          bobo
2      claire  
dtype: object

In [None]:
messy_names.str.replace(';','').str.strip()

0    andrew
1      bobo
2    claire
dtype: object

In [None]:
messy_names.str.replace(';','').str.strip()[0]

'andrew'

In [None]:
# Full cleanup in one chain: drop ';', trim outer whitespace, then capitalize.
(
    messy_names
    .str.replace(';', '')
    .str.strip()
    .str.capitalize()
)

0    Andrew
1      Bobo
2    Claire
dtype: object

## Alternative with Custom apply() call

In [None]:
def cleanup(name):
    """Return a tidied copy of ``name``.

    Every ';' is removed, leading and trailing whitespace is stripped, and
    the result is capitalized (first character upper-cased, the rest
    lower-cased).
    """
    return name.replace(';', '').strip().capitalize()

In [None]:
messy_names.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

## Which one is more efficient?

In [34]:
import timeit 
  
# Setup code handed to timeit: it is executed once per timing call, outside the
# timed statement. It rebuilds the three-element messy_names Series and the
# cleanup() helper, so every benchmark below is self-contained.
# NOTE(review): with only three rows the timings probably reflect fixed
# per-call overhead more than per-element cost -- confirm on a larger Series.
setup = '''
import pandas as pd
import numpy as np
messy_names = pd.Series(["andrew  ","bo;bo","  claire  "])
def cleanup(name):
    name = name.replace(";","")
    name = name.strip()
    name = name.capitalize()
    return name
'''

# Statements whose execution time is measured. All three remove ';', strip
# surrounding whitespace and capitalize each name.
# 1) Chained pandas .str accessor methods.
stmt_pandas_str = ''' 
messy_names.str.replace(";","").str.strip().str.capitalize()
'''

# 2) The Python-level cleanup() function applied through Series.apply.
stmt_pandas_apply = '''
messy_names.apply(cleanup)
'''

# 3) cleanup() wrapped with np.vectorize and called directly on the Series.
stmt_pandas_vectorize='''
np.vectorize(cleanup)(messy_names)
'''

In [35]:
# Total seconds for 10,000 runs of the chained .str accessor version.
timeit.timeit(stmt=stmt_pandas_str, setup=setup, number=10000)

7.14028006100034

In [36]:
# Total seconds for 10,000 runs of the Series.apply(cleanup) version.
timeit.timeit(stmt=stmt_pandas_apply, setup=setup, number=10000)

1.5398060950001309

In [37]:
# Total seconds for 10,000 runs of the np.vectorize(cleanup) version.
timeit.timeit(stmt=stmt_pandas_vectorize, setup=setup, number=10000)

0.29595707199950994