# 字符串操作

In [1]:
import numpy as np
import pandas as pd

## 字符串对象方法

In [2]:
# Sample comma-separated string; note the two spaces in front of 'guido'.
val = 'a,b,  guido'

val

'a,b,  guido'

In [3]:
# Break the string apart at each comma; whitespace around the pieces is kept.
val.split(',')

['a', 'b', '  guido']

In [4]:
# Split on commas, then trim the surrounding whitespace from every field.
pieces = list(map(str.strip, val.split(',')))

pieces

['a', 'b', 'guido']

In [5]:
# Unpack the three pieces into separate names.
first, second, third = pieces

# Glue them back together by hand with a '::' delimiter.
first + '::' + second + '::' + third

'a::b::guido'

In [6]:
# Same result as the manual concatenation, using the delimiter's join method.
'::'.join(pieces)

'a::b::guido'

In [7]:
# Substring membership test.
'guido' in val

True

In [8]:
# Position of the first ',' in val.
val.index(',')

1

In [9]:
# find signals a missing substring with -1 instead of raising.
val.find(':')

-1

In [10]:
# str.index raises ValueError when the substring is absent (str.find returns
# -1 in the same situation). The exception is the point of this cell, so it is
# caught and printed rather than left to abort a top-to-bottom run.
try:
    val.index(':')
except ValueError as exc:
    print('ValueError: {}'.format(exc))

ValueError: substring not found

In [11]:
# Number of occurrences of ',' in val.
val.count(',')

2

In [12]:
# Substitute every ',' with '::'.
val.replace(',',  '::')

'a::b::  guido'

In [13]:
# Replacing with the empty string deletes the commas.
val.replace(',', '')

'ab  guido'

## 正则表达式

In [8]:
import re

In [15]:
# Sample text whose words are separated by varying runs of spaces and tabs.
text = "foo  bar\t baz  \tqux"

text

'foo  bar\t baz  \tqux'

In [16]:
# Split on runs of whitespace. The pattern is a raw string so that \s reaches
# the regex engine as written, instead of depending on Python passing an
# unrecognized string escape through (which warns on newer Python versions).
re.split(r'\s+', text)

['foo', 'bar', 'baz', 'qux']

In [17]:
# Compile the whitespace pattern once so it can be reused below. Raw string:
# \s is a regex escape, not a Python string escape, and the non-raw spelling
# triggers an invalid-escape warning on newer Python versions.
regex = re.compile(r'\s+')

regex

re.compile(r'\s+', re.UNICODE)

In [18]:
# Inspect the class of the compiled pattern object.
type(regex)

_sre.SRE_Pattern

In [19]:
# Split text using the precompiled pattern.
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [20]:
# List every substring of text that matches the pattern (here, the whitespace runs).
regex.findall(text)

['  ', '\t ', '  \t']

## pandas中矢量化的字符串函数

In [2]:
# Sample name -> e-mail mapping; 'Wes' has no address and is marked with NaN.
data = {
    'Dave': 'dave@google.com',
    'Steve': 'steve@mail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
}

data

{'Dave': 'dave@google.com',
 'Rob': 'rob@gmail.com',
 'Steve': 'steve@mail.com',
 'Wes': nan}

In [3]:
# Convert the dict into a Series. Note that this rebinds `data`: from here on
# it is a Series, no longer a dict.
data = pd.Series(data)

data

Dave     dave@google.com
Rob        rob@gmail.com
Steve     steve@mail.com
Wes                  NaN
dtype: object

In [4]:
# Boolean mask marking the missing entries.
data.isnull()

Dave     False
Rob      False
Steve    False
Wes       True
dtype: bool

In [5]:
# Element-wise substring check through the .str accessor; the missing entry
# stays NaN in the result.
data.str.contains('gmail')

Dave     False
Rob       True
Steve    False
Wes        NaN
dtype: object

In [6]:
# E-mail regex with three capture groups: user name, domain name, domain suffix.
# Spelled as a raw string; the resulting value is the same as the escaped form.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [9]:
# Case-insensitive findall on every element; each match comes back as a tuple
# of the captured groups, and the missing entry stays NaN.
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Rob        [(rob, gmail, com)]
Steve     [(steve, mail, com)]
Wes                        NaN
dtype: object

In [10]:
# Test each element against the pattern. As the displayed result shows, this
# yields True/False per element (NaN for the missing entry), not the captured
# groups.
matches = data.str.match(pattern, flags=re.IGNORECASE)

matches

Dave     True
Rob      True
Steve    True
Wes       NaN
dtype: object

In [11]:
# NOTE(review): `matches` holds booleans, not tuples of captured groups, so
# there is nothing for .str.get(1) to index and the recorded result is all NaN.
# If the goal was the second capture group (the domain), something like
# data.str.findall(pattern, flags=re.IGNORECASE).str[0].str.get(1) or
# data.str.extract(pattern, flags=re.IGNORECASE)[1] was presumably intended;
# confirm against the installed pandas version before changing.
matches.str.get(1)

Dave    NaN
Rob     NaN
Steve   NaN
Wes     NaN
dtype: float64

In [12]:
# Show data again: the string operations above returned new objects and left it unchanged.
data

Dave     dave@google.com
Rob        rob@gmail.com
Steve     steve@mail.com
Wes                  NaN
dtype: object

In [13]:
# Show the current working directory with plain Python instead of the bare
# `pwd` automagic, which only works inside IPython when automagic is enabled
# and no variable named `pwd` exists.
# NOTE(review): the recorded output of this cell contains a machine-specific
# absolute path; consider clearing it before sharing the notebook.
import os

os.getcwd()

'C:\\Users\\hasee\\Documents\\Python Scripts\\MyGit\\Data Analysis\\利用Python进行数据分析\\第7章 数据规整化：清理、转换、合并、重塑'