In [17]:
import re

# 摂氏から華氏への変換

## 入力が数値のみの場合

`re`モジュールでは，`(?P<name>regexp)`という形でマッチさせることでマッチ後に取り出すキーワードを指定できる．

In [164]:
def transform_temprature(input_string):
    prog = re.compile(r'(?P<temp>^[0-9]+$)')
    m = prog.match(input_string)
    
    if m:
        temp = int(m.group('temp'))
        trans_temp = (temp * 9 / 5) + 32
    else:
        raise ValueError('input has to be numbers')
        
    print('{celsius} C is {fahrenheit} F'.format(celsius = temp, fahrenheit = trans_temp))

In [165]:
transform_temprature('1000')

1000 C is 1832.0 F


## 入力にマイナスを許す場合  
`-?`を先頭につけることでマイナスの場合を扱える．

In [166]:
def transform_temprature(input_string):
    prog = re.compile(r'(?P<temp>^-?[0-9]+$)')
    m = prog.match(input_string)
    
    if m:
        temp = int(m.group('temp'))
        trans_temp = (temp * 9 / 5) + 32
    else:
        raise ValueError('input has to be numbers')
    
    print('{celsius} C is {fahrenheit} F'.format(celsius = temp, fahrenheit = trans_temp))

In [167]:
transform_temprature('-30')

-30 C is -22.0 F


## 入力を摂氏，華氏両方に対応する場合  

In [168]:
def calc(x, m):
    if m == 'C':
        return (x * 9 / 5) + 32
    elif m == 'F':
        return (x - 32) / 9 * 5

def transform_temprature(input_string):
    prog = re.compile(r'(?P<temp>^-?[0-9]+)\s*(?P<type>[CF])$')
    m = prog.match(input_string)
    temp = int(m.group('temp'))
    
    if m:
        if m.group('type'):
            trans_temp = calc(x = temp, m = m.group('type'))
            print('{celsius:.2f} C is {fahrenheit:.2f} F'.format(celsius = temp, fahrenheit = trans_temp))
        else:
            raise Warning('warning: default conversion method C -> F')
            trans_temp = calc(x = temp, m = 'C')
            print('{fahrenheit:.2f} F is {celsius:.2f} C'.format(fahrenheit = temp, celsius = trans_temp))

    else:
        raise ValueError('input has to be numbers')

            

In [169]:
s = '-30 C'
transform_temprature(s)

-30.00 C is -22.00 F


In [134]:
s = '22 F'
transform_temprature(s)

22.00 C is -5.56 F


## 大文字／小文字を区別しない

In [188]:
def calc(x, m):
    if m == 'C':
        return (x * 9 / 5) + 32
    elif m == 'F':
        return (x - 32) / 9 * 5

def transform_temprature(input_string):
    prog = re.compile(r'(?P<temp>^-?[0-9]+)\s*(?P<type>[CF])$', re.IGNORECASE)
    m = prog.match(input_string)
    input_value = int(m.group('temp'))
    input_type = m.group('type').upper()
    output_type = [e for e in 'CF' if e != input_type][0]
    
    if m:
        if m.group('type'):
            output_value = calc(x = input_value, m = input_type)
            print('{input_value:.2f} {input_type} is {output_value:.2f} {output_type}'.format(
                input_value = input_value,
                input_type = input_type,
                output_value = output_value,
                output_type = output_type
            ))
        else:
            raise Warning('warning: default conversion method C -> F')
            output_value = calc(x = input_value, m = 'C')
            print('{celsius:.2f} F is {fahrenheit:.2f} C'.format(fahrenheit = output_value, celsius = input_value))

    else:
        raise ValueError('input has to be numbers')

            

In [189]:
transform_temprature('-30 c')

-30.00 C is -22.00 F


In [191]:
transform_temprature('1000      f')

1000.00 F is 537.78 C


# テキストの書き換え  

In [206]:
prog = re.compile(r'\bJeff\b', re.IGNORECASE)
prog.subn(string = "jeff Friendly", repl = 'Jeffrey')

('Jeffrey Friendly', 1)

In [228]:
import urllib3
http = urllib3.PoolManager()
r = http.request('GET', 'http://regex.info/dlisting.cgi?ed=3&id=36378')
sample_text = r.data.decode('utf-8')
sample_text

'Dear =FIRST=,\nYou have been chosen to win a brand new =TRINKET=! Free!\nCould you use another =TRINKET= in the =FAMILY= household?\nYes =SUCKER=, I bet you could! Just respond by.....\n\n\n\n\n-----------------------------------------------------------------------------\nCopyright 1997-2020 Jeffrey Friedl\n'

このテキストを次のような変数の値で置き換える．

|box|replace|
|:-|:-|
|=FIRST=|given|
|=FAMILY=|family|
|=SUCKER=|given family|
|=TRINKET=|'fabulous' wunderprize|

In [231]:
given = "Tom"
family = "Cruise"
wunderprize = "100% genuine faux diamond"

In [232]:
fst = re.compile(r'=FIRST=')
fam = re.compile(r'=FAMILY=')
suc = re.compile(r'=SUCKER=')
tri = re.compile(r'=TRINKET=')

In [234]:
tmp = sample_text
tmp = fst.sub(string = tmp, repl = given)
tmp = fam.sub(string = tmp, repl = family)
tmp = suc.sub(string = tmp, repl = ' '.join([given, family]))
tmp = tri.sub(string = tmp, repl = 'fabulous ' + wunderprize)

In [235]:
tmp

'Dear Tom,\nYou have been chosen to win a brand new fabulous 100% genuine faux diamond! Free!\nCould you use another fabulous 100% genuine faux diamond in the Cruise household?\nYes Tom Cruise, I bet you could! Just respond by.....\n\n\n\n\n-----------------------------------------------------------------------------\nCopyright 1997-2020 Jeffrey Friedl\n'

# 小数点の切り捨て  

In [277]:
prog = re.compile(r'(?P<print_number>\d*\.\d\d[1-9]?)\d*')
m = prog.search('9.05000037272')

In [278]:
m.group('print_number')

'9.05'

# シンプルなメールユーティリティ  

In [30]:
import urllib3
http = urllib3.PoolManager()
r = http.request('GET', 'http://regex.info/dlisting.cgi?ed=3&id=36382')
king_in_raw = r.data.decode('utf-8')
king_in_raw

'From elvis Thu Feb 29 11:15 2007\nReceived: from elvis@localhost by tabloid.org (8.11.3) id KA8CMY\nReceived: from tabloid.org by gateway.net (8.12.5/2) id N8XBK\nTo: jfriedl@regex.info (Jeffrey Friedl)\nFrom: elvis@tabloid.org (The King)\nDate: Thu, Feb 29 2007 11:15\nMessage-Id: <2007022939939.KA8CMY@tabloid.org>\nSubject: Be seein\' ya around\nReply-To: elvis@hh.tabloid.org\nX-Mailer: Madam Zelda\'s Psychic Orb [version 3.7 PL92]\n\nSorry I haven\'t been around lately. A few years back I checked\ninto that ole heartbreak hotel in the sky, ifyaknowwhatImean.\nThe Duke says "hi".\n        Elvis\n\n\n\n\n-----------------------------------------------------------------------------\nCopyright 1997-2020 Jeffrey Friedl\n'

In [39]:
king_in = re.split(r'\n\n\n\n\n', king_in_raw)[0]
king_in

'From elvis Thu Feb 29 11:15 2007\nReceived: from elvis@localhost by tabloid.org (8.11.3) id KA8CMY\nReceived: from tabloid.org by gateway.net (8.12.5/2) id N8XBK\nTo: jfriedl@regex.info (Jeffrey Friedl)\nFrom: elvis@tabloid.org (The King)\nDate: Thu, Feb 29 2007 11:15\nMessage-Id: <2007022939939.KA8CMY@tabloid.org>\nSubject: Be seein\' ya around\nReply-To: elvis@hh.tabloid.org\nX-Mailer: Madam Zelda\'s Psychic Orb [version 3.7 PL92]\n\nSorry I haven\'t been around lately. A few years back I checked\ninto that ole heartbreak hotel in the sky, ifyaknowwhatImean.\nThe Duke says "hi".\n        Elvis'

In [40]:
re.split(r'\n', king_in)

['From elvis Thu Feb 29 11:15 2007',
 'Received: from elvis@localhost by tabloid.org (8.11.3) id KA8CMY',
 'Received: from tabloid.org by gateway.net (8.12.5/2) id N8XBK',
 'To: jfriedl@regex.info (Jeffrey Friedl)',
 'From: elvis@tabloid.org (The King)',
 'Date: Thu, Feb 29 2007 11:15',
 'Message-Id: <2007022939939.KA8CMY@tabloid.org>',
 "Subject: Be seein' ya around",
 'Reply-To: elvis@hh.tabloid.org',
 "X-Mailer: Madam Zelda's Psychic Orb [version 3.7 PL92]",
 '',
 "Sorry I haven't been around lately. A few years back I checked",
 'into that ole heartbreak hotel in the sky, ifyaknowwhatImean.',
 'The Duke says "hi".',
 '        Elvis']

In [73]:
def make_reply(mail, detectors):
    response = {}
    message = []
    is_emptyLine = re.compile(r'^(\s*|.*:.*)$')
    
    for l in re.split(r'\n', mail):
        for d_name, d_func in detectors.items():
            if d_func.search(l):
                response[d_name] = d_func.search(l).group(d_name)
        if not is_emptyLine.search(l):
            message.append('|> ' + l)
    
    response['message'] = '\n'.join(message)
    
    return response    

In [76]:
detectors = {
    'To':re.compile(r'To:\s(?P<To>.*)'),
    'From':re.compile(r'From:\s(?P<From>\S+)'),
    'Date':re.compile(r'Date:\s(?P<Date>.*)'),
    'Subject':re.compile(r'Subject:\s(?P<Subject>.*)'),
}

In [77]:
make_reply(king_in, detectors)

{'To': 'elvis@hh.tabloid.org',
 'From': 'elvis@tabloid.org',
 'Date': 'Thu, Feb 29 2007 11:15',
 'Subject': "Be seein' ya around",
 'message': '|> Sorry I haven\'t been around lately. A few years back I checked\n|> into that ole heartbreak hotel in the sky, ifyaknowwhatImean.\n|> The Duke says "hi".\n|>         Elvis'}

# 先後読みによる数値へのカンマ付け

298444215 -> 298,444,215

のような感じでカンマ区切りを挿入する．

## 先読み  

先読み構文は，テキストではなく，テキストの中の位置にマッチする．この時先読みでは，単語の境界やアンカーなどの特殊条件だけに対応するのではない．  
先読みは，テキストの先の方に対して部分正規表現がマッチするかを調べ，マッチするなら正規表現要素として成功となる．

__肯定の先読み__ は `(?=...)`という形で使用できる．例えば`(?=\d)`は次の値が数値になっている位置で成功する．  

__後読み__は既に読んだ左の文章を振り返る形で部分正規表現がマッチするかを調べる．これは`(?<=...)`という形で使用する．  
`(?<=\d)`は左側に数値がある位置で成功する．  

大事なことは，先後読みでは__テキストを消費しない__ことである．

In [79]:
# 通常の正規表現のマッチ
prog = re.compile(r'Jeffrey')
prog.search('by Jeffrey Friendl')

<re.Match object; span=(3, 10), match='Jeffrey'>

`(?=Jeffrey)Jeff`は`Jeffrey`となっている部分の先頭が`Jeff`となっている部分に限りマッチする．なので，`Jefferson`のJeffにはマッチしない．

In [85]:
# 先読みの場合
prog = re.compile(r'(?=Jeffrey)Jeff')
prog.search('by Jeffrey Jefferson')

<re.Match object; span=(3, 7), match='Jeff'>

In [89]:
prog.subn(string='by Jeffrey, Jefferson', repl='++++')

('by ++++rey, Jefferson', 1)

`(?=Jeffrey)Jeff`と`Jeff(?=rey)`は実質的に同じ効果になる．

In [90]:
prog = re.compile(r'Jeff(?=rey)')
prog.sub(string = 'Jeffrey Jefferson', repl = '++++')

'++++rey Jefferson'

## そのほかの例  
`Jeffs`を`Jeff's`に置き換えてみる．  
もちろんこれは`r'\bJeffs\b`をマッチさせて`Jeff's`に置換するだけでも良い  
`r'\b(Jeff)(s)'`で変数に入れておいて変換するのも良い．


先読みを使ってみよう

In [92]:
prog = re.compile(r'Jeff(?=s\b)')
prog.sub(string = 'Jeffs Jeffson', repl = "Jeff'")

"Jeff's Jeffson"

# 先読みと後読みを使う  
`Jeffs`を`Jeff's`に変換するには，わざわざ文字列を置換しなくでも，`Jeff`と`s`の間に`'`を挿入できれば良い，と考えれば  
`(?<=\bJeff)(s\b)`でマッチした位置を置換すれば良いと考えられる．

In [95]:
prog = re.compile(r'(?<=\bJeff)(?=s\b)')
prog.sub(string = 'Jeffs Jeffson', repl = "'")

"Jeff's Jeffson"

In [96]:
prog.sub(string = 'Jeffs Jeffs', repl = '++++')

'Jeff++++s Jeff++++s'

ちなみに，先読み後読みは順序を逆にしても適切に指定すれば結果は同じになる．

In [97]:
prog = re.compile(r'(?=s\b)(?<=\bJeff)')
prog.sub(string = 'Jeffs Jeffson', repl = "'")

"Jeff's Jeffson"

## 数値にカンマを入れる

In [101]:
s = '298444215'

カンマを入れる条件  
1. 右側に３の倍数個の数字がある: `(?=\d\d\d)+$`
1. 左側に何らかの数字がある位置: `(?<=\d)`

In [105]:
prog = re.compile(r'(?<=\d)(?=(\d\d\d)+$)')
prog.sub(string = s, repl = ',')

'298,444,215'

各３桁の数値については，キャプチャしているがこれは特に必要ないので明確な処理を記述するのであれば

In [106]:
prog = re.compile(r'(?<=\d)(?=(?:\d\d\d)+$)')
prog.sub(string = s, repl = ',')

'298,444,215'

とすると良いだろう．

# 単語の境界と否定の先後読み  

数値のカンマ付けを拡張して文章中の数字にもカンマを挿入できるようにしてみよう  

In [115]:
s = 'The population of 29844421 is growing 999999999'

In [116]:
prog = re.compile(r'(?<=\d)(?=(?:\d\d\d)+\b)')
prog.sub(string = s, repl = ",")

'The population of 29,844,421 is growing 999,999,999'

# メールアドレスのリンク化

メールアドレスにマッチする単純な正規表現  
- `\w+\@\w+(\.\w+)+`

改良版
- `\w[-.\w]*+\@[-a-z0-9]+(\.[-a-z0-9]+)*\.(com|edu|info)`

末尾の文字に否定の後読み（これで終わってはいけない文字）を追加する
- `https?://[-a-z0-9]+(\.[-a-z0-9]+*\.(com|edu|info))\b([-a-z0-9_:\@&?=+,.!~*'%\$]*(?<![.,?!]))?`