In [3]:
import numpy as np
import pandas as pd

In [4]:
# A Series followed by `.str` exposes many functions for string manipulation
# (a DataFrame has no `.str` accessor of its own; use it on one of its columns).
names = pd.Series(
    data=['andrew', 'bobo', 'claire', 'david', '5'],
)
names

0    andrew
1      bobo
2    claire
3     david
4         5
dtype: object

In [5]:
names.str.upper() # upper-case every string; the digit-only entry '5' comes back unchanged

0    ANDREW
1      BOBO
2    CLAIRE
3     DAVID
4         5
dtype: object

In [6]:
# Check, element by element, whether each string is made up only of digit characters
# (this inspects the string contents, not the dtype of the Series).
names.str.isdigit() # True only for the entry '5'

0    False
1    False
2    False
3    False
4     True
dtype: bool

In [7]:
# Two groups of tickers, each stored as a single comma-separated string.
# NOTE(review): 'APPL' looks like a typo for Apple's ticker 'AAPL' - confirm before
# changing it, since the recorded outputs in the following cells show 'APPL'.
tech_finance = [
    'GOOG,APPL,AMZN',
    'JPM,BAC,GS',
]

In [8]:
# Wrap the raw strings in a Series. Each row is still one comma-joined string,
# so the tickers are not yet in a usable shape.
tickers = pd.Series(data=tech_finance)
tickers

0    GOOG,APPL,AMZN
1        JPM,BAC,GS
dtype: object

In [9]:
tickers.str.split(',') # split each row on ',' into a list of the elements it contains

0    [GOOG, APPL, AMZN]
1        [JPM, BAC, GS]
dtype: object

In [11]:
# Goal: three separate columns instead of one list per row.
# expand=True makes split() return a DataFrame with one column per piece.
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [12]:
# Same split as in the previous cell, repeated: one column per comma-separated piece.
tickers.str.split(',', expand=True)

Unnamed: 0,0,1,2
0,GOOG,APPL,AMZN
1,JPM,BAC,GS


In [13]:
# Sample names with three kinds of noise: trailing spaces, a stray ';',
# and leading spaces.
messy_names = pd.Series(
    data=['andrew  ', 'bo;bo', '  claire'],
)
messy_names

0    andrew  
1       bo;bo
2      claire
dtype: object

In [15]:
# Series.str.replace(pat, repl):
#   first string  - what you want to replace
#   second string - what it will be replaced with
# regex=False states explicitly that ';' is a literal string. The default of
# `regex` changed from True to False in pandas 2.0, so spelling it out keeps
# the behaviour the same across pandas versions.
messy_names.str.replace(';', '', regex=False)

0    andrew  
1        bobo
2      claire
dtype: object

In [17]:
# Chain the clean-up steps on the .str accessor:
#   1. replace(';', '') drops the stray semicolon (regex=False: literal match,
#      independent of the pandas-version default for `regex`)
#   2. strip() removes leading and trailing whitespace
#   3. capitalize() upper-cases the first letter and lower-cases the rest
messy_names.str.replace(';', '', regex=False).str.strip().str.capitalize()

0    Andrew
1      Bobo
2    Claire
dtype: object

In [22]:
# For more involved clean-ups we can write a plain Python function and apply it per element.
def cleanup(name):
    """Return ``name`` with ';' removed, surrounding whitespace stripped, and capitalized.

    Parameters
    ----------
    name : str
        Raw value to tidy up.

    Returns
    -------
    str
        The cleaned name. Non-string input (e.g. ``None`` or ``NaN`` standing in
        for a missing value) is returned unchanged, mirroring how the pandas
        ``.str`` methods propagate missing values instead of raising.
    """
    if not isinstance(name, str):
        # Missing values in a Series are not strings and have no .replace();
        # pass them through rather than failing with AttributeError.
        return name
    # Same three steps as the .str chain: drop ';', trim whitespace, capitalize.
    return name.replace(";", "").strip().capitalize()

In [23]:
# Series.apply calls cleanup once per element and collects the results in a new Series.
messy_names.apply(cleanup)

0    Andrew
1      Bobo
2    Claire
dtype: object

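Pandas `.str` method calls often run slower than a custom function applied with `.apply`. We can also try `np.vectorize` to improve performance, although NumPy documents it as a convenience wrapper (essentially a for loop), so time both approaches with `%timeit` to confirm.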

In [24]:
# np.vectorize wraps cleanup so it can be called on the whole Series at once;
# note the result is a NumPy array of strings, not a pandas Series.
np.vectorize(cleanup)(messy_names)

array(['Andrew', 'Bobo', 'Claire'], dtype='<U6')