### 1 Handling missing value

Missing values are usually denot by NaN

In [2]:
import numpy as np 
import pandas as pd

In [3]:
srs=pd.Series([1,2,np.nan,3,np.nan])
srs

0    1.0
1    2.0
2    NaN
3    3.0
4    NaN
dtype: float64

In [4]:
srs.isnull()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [5]:
srs.fillna(0)

0    1.0
1    2.0
2    0.0
3    3.0
4    0.0
dtype: float64

In [6]:
srs.dropna()

0    1.0
1    2.0
3    3.0
dtype: float64

In [7]:
srs[srs.notnull()]  #similar to drpona

0    1.0
1    2.0
3    3.0
dtype: float64

##### Filtering Out Missing Data


In [8]:
from numpy import nan as NA
df= pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
df


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


With DataFrame objects, things are a bit more complex. You may want to drop rows
or columns that are all NA or only those containing any NAs. dropna by default drops
any row containing a missing value:

In [9]:
df.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


Passing how='all' will only drop rows that are all NA:

In [10]:
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


To drop column wise

In [11]:
df.dropna(axis=1)

0
1
2
3


In [12]:
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
df[4]=NA
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [14]:
df.dropna(axis='columns',how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
df=pd.DataFrame(np.random.randn(7,3))
df

Unnamed: 0,0,1,2
0,0.162736,-0.920463,-0.075118
1,1.985325,0.99164,0.866905
2,0.110583,-0.907618,0.817024
3,0.038454,1.279644,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


In [16]:
df.iloc[:4,1]=NA
df.iloc[:2,2]=NA
df

Unnamed: 0,0,1,2
0,0.162736,,
1,1.985325,,
2,0.110583,,0.817024
3,0.038454,,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


In the dropna method of a Pandas DataFrame, the thresh parameter is used to specify a threshold for the number of non-null values. This parameter determines the minimum number of non-null values required for a row or column to be retained after calling dropna.

In [17]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.110583,,0.817024
3,0.038454,,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


In [18]:
df

Unnamed: 0,0,1,2
0,0.162736,,
1,1.985325,,
2,0.110583,,0.817024
3,0.038454,,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


In [19]:
df.iloc[df[2].dropna().index]

Unnamed: 0,0,1,2
2,0.110583,,0.817024
3,0.038454,,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


#### Filling in missing data

In [20]:
df

Unnamed: 0,0,1,2
0,0.162736,,
1,1.985325,,
2,0.110583,,0.817024
3,0.038454,,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


In [21]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.162736,0.0,0.0
1,1.985325,0.0,0.0
2,0.110583,0.0,0.817024
3,0.038454,0.0,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


Calling fillna with a dict, you can use a different fill value for each column:

In [22]:
df.fillna({1:0.5,2:0.25},inplace=True)

In [23]:
df

Unnamed: 0,0,1,2
0,0.162736,0.5,0.25
1,1.985325,0.5,0.25
2,0.110583,0.5,0.817024
3,0.038454,0.5,-0.649738
4,-0.176368,-0.767712,-0.531997
5,-1.006508,1.016571,0.551365
6,1.639367,-0.59867,0.292286


In [24]:
df.iloc[3:,1]=np.nan
df.iloc[5:,2]=np.nan
df

Unnamed: 0,0,1,2
0,0.162736,0.5,0.25
1,1.985325,0.5,0.25
2,0.110583,0.5,0.817024
3,0.038454,,-0.649738
4,-0.176368,,-0.531997
5,-1.006508,,
6,1.639367,,


In [25]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,0.162736,0.5,0.25
1,1.985325,0.5,0.25
2,0.110583,0.5,0.817024
3,0.038454,0.5,-0.649738
4,-0.176368,0.5,-0.531997
5,-1.006508,0.5,-0.531997
6,1.639367,0.5,-0.531997


In [26]:
df.fillna(method='ffill',limit=2)

Unnamed: 0,0,1,2
0,0.162736,0.5,0.25
1,1.985325,0.5,0.25
2,0.110583,0.5,0.817024
3,0.038454,0.5,-0.649738
4,-0.176368,0.5,-0.531997
5,-1.006508,,-0.531997
6,1.639367,,-0.531997


In [27]:
df.iloc[1,1]=np.nan
df

Unnamed: 0,0,1,2
0,0.162736,0.5,0.25
1,1.985325,,0.25
2,0.110583,0.5,0.817024
3,0.038454,,-0.649738
4,-0.176368,,-0.531997
5,-1.006508,,
6,1.639367,,


In [28]:
df.fillna(method='ffill',limit=2)

Unnamed: 0,0,1,2
0,0.162736,0.5,0.25
1,1.985325,0.5,0.25
2,0.110583,0.5,0.817024
3,0.038454,0.5,-0.649738
4,-0.176368,0.5,-0.531997
5,-1.006508,,-0.531997
6,1.639367,,-0.531997


#### Removing duplicate

In [29]:
df=pd.DataFrame({'k1':['one','two']*3+['two'],'k2': [1, 1, 2, 3, 3, 4, 4]})
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [30]:
def f(x):
    return x.value_counts()
df.apply(f).fillna("")

Unnamed: 0,k1,k2
1,,2.0
2,,1.0
3,,2.0
4,,2.0
one,3.0,
two,4.0,


In [31]:
df.apply(lambda x : pd.unique(x))

k1      [one, two]
k2    [1, 2, 3, 4]
dtype: object

In [32]:
df.apply(pd.unique)

k1      [one, two]
k2    [1, 2, 3, 4]
dtype: object

The DataFrame method duplicated returns a boolean Series indicating whether each
row is a duplicate (has been observed in a previous row) or not:

In [33]:
df.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [34]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [35]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [36]:
df=df.reindex(np.arange(0,7),axis='rows')
df

Unnamed: 0,k1,k2
0,one,1.0
1,two,1.0
2,one,2.0
3,two,3.0
4,one,3.0
5,two,4.0
6,,


In [37]:
df.iloc[6]=['two',4]
df

Unnamed: 0,k1,k2
0,one,1.0
1,two,1.0
2,one,2.0
3,two,3.0
4,one,3.0
5,two,4.0
6,two,4.0


In [38]:
df['k2'].duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
Name: k2, dtype: bool

In [39]:
df.drop(df[df['k2'].duplicated()].index,axis='rows')

Unnamed: 0,k1,k2
0,one,1.0
2,one,2.0
3,two,3.0
5,two,4.0


In [40]:
df

Unnamed: 0,k1,k2
0,one,1.0
1,two,1.0
2,one,2.0
3,two,3.0
4,one,3.0
5,two,4.0
6,two,4.0


In [41]:
df.drop_duplicates('k1')

Unnamed: 0,k1,k2
0,one,1.0
1,two,1.0


In [42]:
df['v1']=np.arange(0,7)
df

Unnamed: 0,k1,k2,v1
0,one,1.0,0
1,two,1.0,1
2,one,2.0,2
3,two,3.0,3
4,one,3.0,4
5,two,4.0,5
6,two,4.0,6


In [43]:
df.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1.0,0
1,two,1.0,1
2,one,2.0,2
3,two,3.0,3
4,one,3.0,4
6,two,4.0,6


#### Transforming Data Using a Function or Mapping

In [44]:
df=pd.DataFrame({'Food':['Dosa','chapathi','Idly','poori','Rava_idly'],'quantity':[1,2,1,3,2]})
df

Unnamed: 0,Food,quantity
0,Dosa,1
1,chapathi,2
2,Idly,1
3,poori,3
4,Rava_idly,2


suppose u want to add type of core ingrediemt that each food came from let us write the dictionary of food and core ingredient.

In [45]:
item={'dosa':'rice','chapathi':'wheat','idly':'rice','poori':'wheat','rava_idly':'rava'}
item

{'dosa': 'rice',
 'chapathi': 'wheat',
 'idly': 'rice',
 'poori': 'wheat',
 'rava_idly': 'rava'}

The map method on a Series accepts a function or dict-like object containing a map‐
ping, but here we have a small problem in that some of the food contains uppercase and some not
. Thus, we need to convert each value to lowercase using the str.lower
Series method:

In [46]:
lowerd=df.Food.str.lower()
lowerd

0         dosa
1     chapathi
2         idly
3        poori
4    rava_idly
Name: Food, dtype: object

In [47]:
df['ingredient']=lowerd.map(item)
df

Unnamed: 0,Food,quantity,ingredient
0,Dosa,1,rice
1,chapathi,2,wheat
2,Idly,1,rice
3,poori,3,wheat
4,Rava_idly,2,rava


In [48]:
lowerd.map(item)

0     rice
1    wheat
2     rice
3    wheat
4     rava
Name: Food, dtype: object

In [49]:
df1=df.iloc[:,:2]
df1

Unnamed: 0,Food,quantity
0,Dosa,1
1,chapathi,2
2,Idly,1
3,poori,3
4,Rava_idly,2


In [50]:
df1['Ingredient']=df1['Food'].map(lambda x:item[x.lower()])
df1

Unnamed: 0,Food,quantity,Ingredient
0,Dosa,1,rice
1,chapathi,2,wheat
2,Idly,1,rice
3,poori,3,wheat
4,Rava_idly,2,rava


### Replacing Values.

In [51]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [52]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

If you want to replace multiple values at once, you instead pass a list and then the
substitute value:

In [53]:
data.replace([-999,-1000],np.nan)


0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

To use a different replacement for each value, pass a list of substitutes:

In [54]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

one can also pass dictionary

In [55]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [56]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [57]:
data.index=data.index.str.upper()
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLORADO,4,5,6,7
NEW YORK,8,9,10,11


In [58]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [59]:
data.index.map(lambda x :x.upper())

Index(['OHIO', 'COLORADO', 'NEW YORK'], dtype='object')

In [60]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [61]:
data.index=data.index.map(lambda x : x[:4].upper())
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


If you want to create a transformed version of a dataset without modifying the origi‐
nal, a useful method is rename:

In [62]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),
index=['Ohio', 'Colorado', 'New York'],
columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


In [63]:
data.rename(columns=str.upper,index=str.title)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


Notably, rename can be used in conjunction with a dict-like object providing new val‐
ues for a subset of the axis labels:

In [67]:
data.rename(index={'Ohio': 'INDIANA'},
columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
Colorado,4,5,6,7
New York,8,9,10,11


###  Discretization and Binning

Continuous data is often discretized or otherwise separated into “bins” for analysis.
Suppose you have data about a group of people in a study, and you want to group
them into discrete age buckets:

In [69]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [70]:
bins=[18,25,35,60,100]

In [71]:
a=pd.cut(ages,bins)

In [72]:
a

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [73]:
a.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [75]:
a.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [76]:
a.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

Consistent with mathematical notation for intervals, a parenthesis means that the side
is open, while the square bracket means it is closed (inclusive). You can change which
side is closed by passing right=False:

In [77]:
b=pd.cut(ages,[18,25,35,60,100],right=False)
b

[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [78]:
b.categories

IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')

In [80]:
b.codes

array([0, 0, 1, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

You can also pass your own bin names by passing a list or array to the labels option:

In [84]:
b=pd.cut(ages,[18,25,35,60,100],labels=['Youth','YoungAdult','Middle_aged','Old'])
b.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [85]:
b.categories

Index(['Youth', 'YoungAdult', 'Middle_aged', 'Old'], dtype='object')

In [87]:
print(b)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Old', 'Middle_aged', 'Middle_aged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'Middle_aged' < 'Old']


In [88]:
b.value_counts()

Youth          5
YoungAdult     3
Middle_aged    3
Old            1
dtype: int64

If you pass an integer number of bins to cut instead of explicit bin edges, it will com‐
pute equal-length bins based on the minimum and maximum values in the data.
Consider the case of some uniformly distributed data chopped into fourths:

In [90]:
data=np.random.rand(20)
data

array([0.3909184 , 0.09273468, 0.44657437, 0.14646659, 0.0872274 ,
       0.88484569, 0.4008184 , 0.37521578, 0.31155745, 0.63112303,
       0.68831835, 0.96244751, 0.78994559, 0.45751741, 0.08526638,
       0.85032561, 0.14579996, 0.54120508, 0.70818653, 0.34443762])

In [94]:
c=pd.cut(data,4,right=False,labels=['1st','2nd','3rd','4th'])

In [96]:
c

['2nd', '1st', '2nd', '1st', '1st', ..., '4th', '1st', '3rd', '3rd', '2nd']
Length: 20
Categories (4, object): ['1st' < '2nd' < '3rd' < '4th']

In [95]:
c.value_counts()

1st    5
2nd    7
3rd    4
4th    4
dtype: int64

In [97]:
d=pd.cut(data,4,precision=2)
d

[(0.3, 0.52], (0.084, 0.3], (0.3, 0.52], (0.084, 0.3], (0.084, 0.3], ..., (0.74, 0.96], (0.084, 0.3], (0.52, 0.74], (0.52, 0.74], (0.3, 0.52]]
Length: 20
Categories (4, interval[float64, right]): [(0.084, 0.3] < (0.3, 0.52] < (0.52, 0.74] < (0.74, 0.96]]

###### qcut

A closely related function, qcut, bins the data based on sample quantiles. Depending
on the distribution of the data, using cut will not usually result in each bin having the
same number of data points. Since qcut uses sample quantiles instead, by definition
you will obtain roughly equal-size bins:

In [99]:
e=pd.qcut(data,4)
e.value_counts()

(0.0843, 0.27]    5
(0.27, 0.424]     5
(0.424, 0.693]    5
(0.693, 0.962]    5
dtype: int64

In [100]:
e

[(0.27, 0.424], (0.0843, 0.27], (0.424, 0.693], (0.0843, 0.27], (0.0843, 0.27], ..., (0.693, 0.962], (0.0843, 0.27], (0.424, 0.693], (0.693, 0.962], (0.27, 0.424]]
Length: 20
Categories (4, interval[float64, right]): [(0.0843, 0.27] < (0.27, 0.424] < (0.424, 0.693] < (0.693, 0.962]]

### Detecting and Filtering Outliers


In [105]:
data=pd.DataFrame(np.random.randn(1000,4))
data.head(3)

Unnamed: 0,0,1,2,3
0,-0.139635,-0.296879,0.690911,2.292151
1,-1.07607,-0.733719,1.628591,0.584475
2,0.586906,-0.017627,-0.546823,1.588663


In [106]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.009645,-0.024871,-0.075094,0.048142
std,0.974341,1.0043,1.017515,0.981479
min,-3.267544,-3.258486,-4.017027,-2.889213
25%,-0.653928,-0.730847,-0.755925,-0.619083
50%,0.008127,-0.043329,-0.081534,0.067232
75%,0.666847,0.655383,0.622446,0.646763
max,2.986244,3.19102,3.56709,3.2598


Suppose you wanted to find values in one of the columns exceeding 3 in absolute
value:

In [117]:
data[np.abs(data[1])>3]

Unnamed: 0,0,1,2,3
418,0.225826,-3.099905,0.292579,0.284635
498,0.574112,3.19102,1.24311,0.054928
573,-1.553577,-3.258486,-0.33832,-1.041274


In [118]:
data[~(np.abs(data[1])>3)]

Unnamed: 0,0,1,2,3
0,-0.139635,-0.296879,0.690911,2.292151
1,-1.076070,-0.733719,1.628591,0.584475
2,0.586906,-0.017627,-0.546823,1.588663
3,-1.167133,-0.741436,0.131853,0.982735
4,-0.435999,-0.152300,1.404456,-0.383649
...,...,...,...,...
995,-0.370813,0.635009,-0.410714,1.025589
996,0.296423,0.786177,-0.331175,0.160165
997,-0.335764,-0.827353,-1.067624,-1.504288
998,-0.860340,0.626752,0.881399,-0.392021


To select all rows having a value exceeding 3 or –3, you can use the any method on a
boolean DataFrame

In [125]:
data[(np.abs(data)>3).any(1)]

Unnamed: 0,0,1,2,3
82,-1.987328,-0.877441,-4.017027,-0.212112
84,0.355614,-0.18006,0.19121,3.2598
418,0.225826,-3.099905,0.292579,0.284635
498,0.574112,3.19102,1.24311,0.054928
573,-1.553577,-3.258486,-0.33832,-1.041274
577,-0.057644,0.204965,-1.601703,3.088932
607,0.28576,-1.570562,1.474037,3.081562
642,0.254771,-0.999223,3.100069,-0.106919
644,-3.267544,-0.755826,1.227066,-0.683377
722,-0.995473,1.841538,3.56709,1.044357


In [129]:
data[np.abs(data)>3]=np.sign(data)*3

In [131]:
data.iloc[82,2]

-3.0

In [137]:
a=pd.Series(np.arange(4))
a

0    0
1    1
2    2
3    3
dtype: int32

In [138]:
b=pd.Series(np.arange(4,9))
b

0    4
1    5
2    6
3    7
4    8
dtype: int32

In [139]:
a[a<2]=b
a

0    4
1    5
2    2
3    3
dtype: int32

### Computing Indicator/Dummy Variables

In [142]:
df=pd.DataFrame({'Key':['c','b','c','b','a','b'],'data':np.arange(6)})
df

Unnamed: 0,Key,data
0,c,0
1,b,1
2,c,2
3,b,3
4,a,4
5,b,5


In [143]:
pd.get_dummies(df['Key'])

Unnamed: 0,a,b,c
0,0,0,1
1,0,1,0
2,0,0,1
3,0,1,0
4,1,0,0
5,0,1,0


In [146]:
df[['data']].join(pd.get_dummies(df['Key'],prefix='Key'))

Unnamed: 0,data,Key_a,Key_b,Key_c
0,0,0,0,1
1,1,0,1,0
2,2,0,0,1
3,3,0,1,0
4,4,1,0,0
5,5,0,1,0


#### some of the string method

In [1]:
val='a,b, Guido'

a=val.split(",")
a

['a', 'b', ' Guido']

In [2]:
[x.strip() for x in a]

['a', 'b', 'Guido']

split is often combined with strip to trim whitespace (including line breaks):

In [4]:
val='a,b, guido'
" ".join([x.strip() for x in val.split(',')])

'a b guido'

lstrip removes whitespace at the left
rstrip at right

####  Regular expression

The re module functions fall into three categories: pattern matching, substitution,
and splitting. Naturally these are all related; a regex describes a pattern to locate in the
text, which can then be used for many purposes. Let’s look at a simple example:

In [1]:
import re

suppose we wanted to split a string with a variable number of whitespace characters
(tabs, spaces, and newlines). The regex describing one or more whitespace characters
is \s+:

In [12]:
text = "foo   bbar\t baz \tqux"

re.split('\s+',text)

['foo', 'bbar', 'baz', 'qux']

When you call re.split('\s+', text), the regular expression is first compiled, and
then its split method is called on the passed text. You can compile the regex yourself 
with re.compile, forming a reusable regex object:

In [7]:
space=re.compile('\s+')
space.split(text)


['foo', 'bar', 'baz', 'qux']

If, instead, you wanted to get a list of all patterns matching the regex, you can use the
findall method

In [8]:
re.findall('\s+',text)

['   ', '\t ', ' \t']

match and search are closely related to findall. While findall returns all matches
in a string, search returns only the first match. More rigidly, match only matches at
the beginning of the string. As a less trivial example, let’s consider a block of text and
a regular expression capable of identifying most email addresses.

In [40]:
text = """Dave dave@google.com 
Steve steve@gmail.com
Rob rob@gmail.com 
Ryan ryan@yahoo.com
"""
text

'Dave dave@google.com \nSteve steve@gmail.com\nRob rob@gmail.com \nRyan ryan@yahoo.com\n'

In [41]:
pattern=r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

In [42]:
regex=re.compile(pattern,flags=re.IGNORECASE)


In [43]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [44]:
regex.search(text)

<re.Match object; span=(5, 20), match='dave@google.com'>

.search returns first matching pattern and its index

In [45]:
print(regex.match(text))

None


regex.match returns None, as it only will match if the pattern occurs at the start of the
string

Relatedly, sub will return a new string with occurrences of the pattern replaced by the
a new string

In [46]:
regex.sub("REDACTED",text)

'Dave REDACTED \nSteve REDACTED\nRob REDACTED \nRyan REDACTED\n'

Suppose you wanted to find email addresses and simultaneously segment each
address into its three components: username, domain name, and domain suffix. To
do this, put parentheses around the parts of the pattern to segment:


In [50]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex=re.compile(pattern , flags=re.IGNORECASE)

In [56]:
a=regex.match('west123@gmail.com')
a.groups()

('west123', 'gmail', 'com')

In [57]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [60]:
print(regex.sub(r"name: \1 , domain: \2, suffix: \3",text))

Dave name: dave , domain: google, suffix: com 
Steve name: steve , domain: gmail, suffix: com
Rob name: rob , domain: gmail, suffix: com 
Ryan name: ryan , domain: yahoo, suffix: com



##### Vectorized String Functions in pandas

In [65]:
import numpy as np
import pandas as pd
data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',
'Rob': 'rob@gmail.com', 'Wes': np.nan}
data=pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [66]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

You can apply string and regular expression methods can be applied (passing a
lambda or other function) to each value using data.map, but it will fail on the NA
(null) values. To cope with this, Series has array-oriented methods for string opera‐
tions that skip NA values. These are accessed through Series’s str attribute; for exam‐
ple, we could check whether each email address has 'gmail' in it with str.contains

In [68]:
regex=re.compile('@gmail')
data[:-1].map(lambda x:regex.findall(x))

Dave           []
Steve    [@gmail]
Rob      [@gmail]
dtype: object

In [69]:
data.map(lambda x:regex.findall(x))  #you  get error due to issing values

TypeError: expected string or bytes-like object

In [70]:
data.str.contains("@gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [71]:
data.str.findall('([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})',flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object