In [1]:
import pandas as pd
import numpy as np
import re
import datetime as dt
import pytz
import sys,os
# Extend the import path so the project-level `env` module can be imported below.
ENV_PATH = '../../ENV/'
sys.path.append(ENV_PATH)
from env import ENV

# TimePattern.__init__ locates mapping.csv via os.path.dirname(__file__); set __file__
# by hand here so that lookup resolves to the time_pattern model directory.
__file__ = '../../classifier/models/time_pattern/'

class TimePattern:
    def __init__(self,tz=None):
        """
        tz = "Asia/Shanghai"
        tz = pytz.timezone(tz)
        'America/New_York'
        """
        self.pattern_csv = os.path.join(os.path.dirname(__file__), 'mapping.csv')
        self._set_timeZone(tz)
        self._load_mapping(self.pattern_csv)
        self.time_dic = {'今':'?','明':'+1','后':'+2','大后':'+3','下个':'+1','下下个':'+2',
                         '再下个':'+2','下下下个':'+3','后1个':'+1','后2个':'+2','后一个':'+1',
                         '后两个':'+2',
                         '一':'1','二':'2','三':'3','四':'4','五':'5',
                         '六':'6','七':'7','八':'8','九':'9','十':'10',
                         '十一':'11','十二':'12','十三':'13','十四':'14','十五':'15',
                         '十六':'16','十七':'17','十八':'18','十九':'19','二十':'20',
                         '二十一':'21','二十二':'22','二十三':'23','二十四':'24','二十五':'25',
                         '二十六':'26','二十七':'27','二十八':'28','二十九':'29','三十':'30','三十一':'31',
                         '四十':'40','五十':'50','六十':'60','七十':'70','八十':'80','九十':'90','一百':'100'}
        self.fix_ymd = r'(?:(?:今|明|后|大后)年)?(?:(?:\d{1,2}|下下下个|下下个|再下个|下个|十一|十二|一|二|三|四|五|六|七|八|九|十|后1个|后2个|后一个|后两个|后二个)月)(?:\d{1,2}|一|二|三|四|五|六|七|八|九|十|十一|十二|十三|十四||十六|十七|十八|十九|二十|二十一|二十二|二十三|二十四|二十五|二十六|二十七|二十八|二十九|三十|三十一)[日号]'
        reg_num = r'(?:(?:一|二|三|四|五|六|七|八|九|十|十一|十二|十三|十四|十五|二十|三十|四十|五十|六十|七十|八十|九十)|\d+)'
        self.fix_period = r'(?:{}天)?(?:{}小时)?(?:{}分钟)?'.format(reg_num,reg_num,reg_num)
        
    def remove_time(self,sentence):
        sentence = re.sub(r' ','',sentence)
        sentence = re.sub(self.re_ext,' ',sentence)
        
        finds_ymd = set(re.findall(self.fix_ymd,sentence)) - set([''])
        finds_period = set(re.findall(self.fix_period,sentence)) - set([''])
        if len(finds_ymd) > 0:
            sentence = re.sub('|'.join(finds_ymd),' ',sentence)
        if len(finds_period) > 0:
            sentence = re.sub('|'.join(finds_period),' ',sentence)

        if sentence == '':
            return ' '
        return sentence
        
    
    def process(self, sentence):
        current = self._get_LocalNow()
        sentence = re.sub(r" ",'',sentence)
        fixymd = self.evl_ymd(sentence)
        selfdefine = re.findall(self.re_ext, sentence)
        period = self.evl_period(sentence)
        result = []
        for each in fixymd:
            future = self.evl(each['expression'])
            gap = (future - current).total_seconds()
            result.append({'pattern':each['pattern'], 'time':future, 'gapS':gap, 'gapH':gap/3600})
        for each in selfdefine:
            print(self.dict_ext[each])
            future = self.evl(self.dict_ext[each])
            gap = (future - current).total_seconds()
            result.append({'pattern':each, 'time':future, 'gapS':gap, 'gapH':gap/3600})
        for each in period:
            future = self.evl(each['expression'])
            gap = (future - current).total_seconds()
            result.append({'pattern':each['pattern'], 'time':future, 'gapS':gap, 'gapH':gap/3600})
        return result
    
    def evl(self, expression):
        current = self._get_LocalNow()
        exp_week = re.findall(r'-.+W-.+w',expression)
        exp_ymd = re.findall(r'.+y-.+m.+d',expression)

        history = self._pros_second(expression, current)
        history = self._pros_minute(expression, current, history)
        history,shift = self._pros_hour(expression, current, history)

        if exp_week:
            history = self._pros_weekDay(expression,current, history)
            history = self._pros_week(expression,current, history)
            history = self._pros_year(expression, current,history)
            future = self.create_from_W(history)  
        elif exp_ymd:
            history = self._pros_day(expression, current,history)
            history = self._pros_month(expression, current,history)
            history = self._pros_year(expression, current,history)
            future = self.create_from_D(history)
        if future.tzinfo is None:
            future = self.tz.localize(future)
        return future
    
    def _load_mapping(self, pattern_path):
        df = pd.read_csv(pattern_path)
        # create length
        df['length'] = df.key_word.apply(lambda x: len(x))
        df = df.sort_values(['length','key_word'], ascending=False)
        df_series = pd.Series(index=df.key_word.values, data=df.expression.values)
        df_dict = df_series.to_dict()
        self.serires = df_series
        self.re_ext = r'|'.join(self.serires.index.values)
        self.dict_ext = df_dict
        
    def _set_timeZone(self,tz=None):
        if tz is None:
            tz = ENV.TIMEZONE.value
            print('Time Zone is set from ENV: {}'.format(tz))
        tz = ENV.TIMEZONE.value
        self.tz = pytz.timezone(tz)
        self.delta = self.tz.utcoffset(dt.datetime.utcnow())
        
        
    def _get_LocalNow(self):
        now = dt.datetime.utcnow()
        return self.tz.localize(now) + self.delta
        
    def _pros_second(self, expression, current, history={'microsecond':0}):
        history = history.copy()
        S = current.second
        reexp = r'M:.+S'
        extract = re.findall(reexp,expression)[0]
        # M:+1S
        if extract[2:-1] == '?':
            history.update({'second':S})
            return history
        elif extract[2] == '+':
            gap = int(extract[3:-1])
            create = current + dt.timedelta(seconds=gap) 
            create = create.replace(**history)
            return create 
        elif extract[2] == '-':
            gap = int(extract[3:-1])
            create =  current - dt.timedelta(seconds=gap)
            create = create.replace(**history)
            return create
        else:
            second = int(extract[2:-1])
            history.update({'second':second})
            return history
        
    def _pros_minute(self, expression, current, history = {}):
        if isinstance(history,dt.datetime):
            return history
        history = history.copy()
        M = current.minute
        reexp = r'H:.+M'
        extract = re.findall(reexp,expression)[0]
        # H:?M
        if extract[2:-1] == '?':
            history.update({'minute':M})
            return history
        elif extract[2] == '+':
            gap = int(extract[3:-1])
            create = current + dt.timedelta(minutes=gap) 
            create = create.replace(**history)
            return create
        elif extract[2] == '-':
            gap = int(extract[3:-1])
            create = current - dt.timedelta(minutes=gap) 
            create = create.replace(**history)
            return create
        else:
            minute = int(extract[2:-1])
            history.update({'minute':minute})
            return history
        
    def _pros_hour(self, expression, current, history = {}):
        shift = True
        if isinstance(history,dt.datetime):
            return history, shift
        history = history.copy()

        H = current.hour
        reexp = r'[dw]-.+H'
        extract = re.findall(reexp,expression)[0]
    #     d-?H
        if extract[2:-1] == '?':
            history.update({'hour':H})
            return history,shift
        elif extract[2] == '+':
            gap = int(extract[3:-1])
            create = current + dt.timedelta(hours=gap)
            create = create.replace(**history)
            return create,shift
        elif extract[2] == '-':
            gap = int(extract[3:-1])
            create = current - dt.timedelta(hours=gap)
            create = create.replace(**history)
            return create,shift
        else:
            shift = False
            hour = int(extract[2:-1])
            history.update({'hour':hour})
            return history,shift
        
    def _pros_day(self, expression, current, history={}):
        if isinstance(history,dt.datetime):
            return history
        history = history.copy()
        d = current.day
        reexp = r'm-.+d'
        extract = re.findall(reexp,expression)[0]
        if extract[2:-1] == '?':
            history.update({'day':d})
            return history
        elif extract[2] == '+':
            gap = int(extract[3:-1])
            create = current + dt.timedelta(days=gap) 
            create = create.replace(**history)
            return create
        elif extract[2] == '-':
            gap = int(extract[3:-1])
            create = current - dt.timedelta(days=gap) 
            create = create.replace(**history)
            return create
        else:
            day = int(extract[2:-1])
            history.update({'day':day})
            return history
        
    def _pros_month(self, expression, current, history):
        if isinstance(history,dt.datetime):
            return history
        history = history.copy()
        adjust_year = 0
        m = current.month
        reexp = r'y-.+m'
        extract = re.findall(reexp,expression)[0]
        if extract[2:-1] == '?':
            history.update({'month':m})
            return history
        elif extract[2] == '+':
            cur = int(extract[3:-1]) + m
            if cur > 12:
                adjust_year = int(cur / 12)
                cur = cur % 12
                if cur == 0:
                    cur = 12
                    adjust_year -= 1
            history.update({'year':adjust_year})
            history.update({'month':cur})
            return history
        elif extract[2] == '-':
            cur = m - int(extract[3:-1])
            if cur < 1:
                adjust_year = int(cur / 12) - 1
                cur = cur % 12
                if cur == 0:
                    cur = 12
            history.update({'year':adjust_year})
            history.update({'month':cur})
            return history
        else:
            history.update({'month':int(extract[2:-1])})
            return history
        
    def _pros_year(self, expression, current, history):
        if isinstance(history,dt.datetime):
            return history
        history = history.copy()
        adjust_year = history.get('year')
        if adjust_year is None:
            adjust_year = 0
        y = current.year
        reexp = r'.+y-'
        extract = re.findall(reexp,expression)[0]
        if extract[0:-2] == '?':
            history.update({'year':y+adjust_year})
            return history
        elif extract[0] == '+':
            gap = int(extract[1:-2])
            history.update({'year':y+adjust_year+gap})
            return history
        elif extract[0] == '-':
            gap = int(extract[1:-2])
            history.update({'year':y+adjust_year-gap})
            return history
        else:
            history.update({'year':int(extract[:-2])+adjust_year})
            return history
        
    def _pros_weekDay(self, expression, current, history):
        history = history.copy()
        w = current.isocalendar()[2] % 7
        reexp = r'W-.+w'
        extract = re.findall(reexp,expression)[0]

        # W-+1w
        if extract[2:-1] == '?':
            history.update({'weekday':str(w)})
            return history
        elif extract[2] == '+':
            rep = str(w + int(extract[3:-1]))
            history.update({'weekday':rep})
            return history
        elif extract[2] == '-':
            rep = str(w - int(extract[3:-1]))
            history.update({'weekday':rep})
            return history
        else:
            rep = extract[2:-1]
            history.update({'weekday':rep})
            return history
        
    def _pros_week(self, expression, current, history):
        history = history.copy()
        year_adjust = 0
        W = current.isocalendar()[1] 
        reexp = r'y-.+W'
        extract = re.findall(reexp,expression)[0]

        # y-+1W
        if extract[2:-1] == '?':
            rep = str(W)
            history.update({'week':rep})
            return history
        elif extract[2] == '+':
            cur = W + int(extract[3:-1])
            if cur > 53:
                year_adjust = int(cur / 53)
                cur = cur % 53
            rep = str(cur)
            history.update({'year':year_adjust})
            history.update({'week':rep})
            return history
        elif extract[2] == '-':
            cur = W - int(extract[3:-1])
            if cur < 0:
                year_adjust = int(cur / 53) -1
                cur = cur % 53
            rep = str(cur)
            history.update({'year':year_adjust})
            history.update({'week':rep})
            return history
        else:
            rep = extract[2:-1]
            history.update({'week':rep})
            return history
    
    def create_from_D(self, history):
        if isinstance(history,dt.datetime):
            return history
        return dt.datetime(**history)
    
    def create_from_W(self, history):
        expression ='{}y-{}W-{}w-{}H:{}M:{}S'.format(history['year'],
                                                     history['week'],
                                                     history['weekday'],
                                                     history['hour'],
                                                     history['minute'],
                                                     history['second'])
        eval_time = dt.datetime.strptime(expression, "%Yy-%WW-%ww-%HH:%MM:%SS")
        return eval_time
    
    def ymd_reg(self,x):
        
        finds = list(set(re.findall(self.fix_ymd,x)) -set(['']))
        return finds

    def ymd_expression(self, result):
        def get_key(x):
            if x.isdigit():
                return x
            else:
                gets = self.time_dic.get(x)
                if x is None:
                    return '?'
                else:
                    return gets


        year_index = result.find('年')
        month_index = result.find('月')
        if result.find('日') != -1:
            date_index = result.find('日')
        else:
            date_index = result.find('号')

        if year_index != -1:
            year_key = result[0:year_index]  
            year = get_key(year_key)
        else:
            year = '?'
        if month_index != -1:
            month_key = result[year_index+1:month_index]
            month = get_key(month_key)
        else:
            month = '?'
        if date_index != -1:
            date_key = result[month_index+1:date_index]
            date = get_key(date_key)
        else:
            date = '?'
        formatted = '{}y-{}m-{}d-12H:00M:00S'.format(year, month, date)
        return formatted
    
    def evl_ymd(self,text):
        finds = self.ymd_reg(text)
        evls = []
        if len(finds) == 0:
            return evls
        else:
            for each in finds:
                evls.append({'pattern':each, 'expression':self.ymd_expression(each)})
        return evls
   

    def period_reg(self,x):
        
        finds = list(set(re.findall(self.fix_period,x)) - set(['']))
        return finds
    
    def period_expression(self,find):
        day_index = find.find('天')
        hour_index = find.find('小时')
        minute_index = find.find('分钟')
        if day_index != -1:
            day = find[0:day_index]
            if self.time_dic.get(day) is not None:
                day = int(self.time_dic.get(day))
            else:
                try:
                    day = int(day)
                except Exception:
                    day = 0
        else:
            day = 0

        if hour_index != -1:
            hour = find[day_index+1:hour_index]
            if self.time_dic.get(hour) is not None:
                hour = int(self.time_dic.get(hour))
            else:
                try:
                    hour = int(hour)
                except Exception:
                    hour = 0
        else:
            hour = 0

        if minute_index != -1:
            minute = find[hour_index+2:minute_index]
            if self.time_dic.get(minute) is not None:
                minute = int(self.time_dic.get(minute))
            else:
                try:
                    minute = int(minute)
                except Exception:
                    minute = 0
        else:
            minute = 0

        delta_minute = 24*60*day + 60*hour +minute 
        formatted = '?y-?m-?d-?H:+{}M:00S'.format(delta_minute)
        return formatted
    
    def evl_period(self,text):
        finds = self.period_reg(text)
        evls = []
        if len(finds) == 0:
            return evls
        else:
            for each in finds:
                evls.append({'pattern':each, 'expression':self.period_expression(each)})
        return evls



        
        
        
    def test_case1(self):
        """
        test if there is any overlab between self-defined and the fixed expression
        """
        error_result = []
        for each_pattern in self.serires.index.values:
            fixymd = self.evl_ymd(each_pattern)
            if len(fixymd) > 0:
                pattern = fixymd[0]['pattern']
#                 if pattern == each_pattern:
                error_result.append(each_pattern)
        print('============ test case 1 is below ==============')
        print(error_result)
        
    def test_case2(self):
        """
        This test is used to test all self define mapping;
        check is the evl string is correct
        """
        error_result = []
        for key in self.dict_ext:
            try:
                evl = self.evl(self.dict_ext[key])
            except Exception:
                error_result.append(key)
                print(key)
        print('============ test case 2 is below ==============')
        print(error_result )
        

In [2]:
t = TimePattern()

Time Zone is set from ENV: Asia/Shanghai


In [3]:
t.process('一个月')

?y-?m-+30d-15H:00M:00S


[{'pattern': '一个月',
  'time': datetime.datetime(2018, 10, 5, 15, 0, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>),
  'gapS': 2631422.563428,
  'gapH': 730.9507120633334}]

In [4]:
#after period 
s1 = '三天2小时5分钟后就还'
s2 = '四小时后就还'
s3 = '2小时就还'
s4 = '  哈哈水电费'

In [5]:
t.remove_time(s1)

' 后就还'

In [6]:
reg_num = r'(?:(?:一|二|两|三|四|五|六|七|八|九|十|十一|十二|十三|十四|十五|二十|三十|四十|五十|六十|七十|八十|九十)|\d+)'
reg = r'(?:{}(?:天|日){}小时)|(?:{}(?:天|小时|日))'.format(reg_num,reg_num,reg_num)
finds = re.findall(reg,s1)


In [7]:
reg_num = r'(?:(?:一|二|两|三|四|五|六|七|八|九|十|十一|十二|十三|十四|十五|二十|三十|四十|五十|六十|七十|八十|九十)|\d+)'
reg = r'(?:{}(?:天|日))?(?:{}小时)?(?:{}分钟)?'.format(reg_num,reg_num,reg_num)
finds = list(set(re.findall(reg,s1)) - set(['']))

In [8]:
finds

['三天2小时5分钟']

In [9]:
mapping = {'今':'?','明':'+1','后':'+2','大后':'+3','下个':'+1','下下个':'+2',
                         '再下个':'+2','下下下个':'+3','后1个':'+1','后2个':'+2','后一个':'+1',
                         '后两个':'+2',
                         '一':'1','二':'2','三':'3','四':'4','五':'5',
                         '六':'6','七':'7','八':'8','九':'9','十':'10',
                         '十一':'11','十二':'12','十三':'13','十四':'14','十五':'15',
                         '十六':'16','十七':'17','十八':'18','十九':'19','二十':'20',
                         '二十一':'21','二十二':'22','二十三':'23','二十四':'24','二十五':'25',
                         '二十六':'26','二十七':'27','二十八':'28','二十九':'29','三十':'30','三十一':'31',
                         '四十':'40','五十':'50','六十':'60','七十':'70','八十':'80','九十':'90','一百':'100'}

In [10]:
find = '三天2小时5分钟'


def _parse_amount(token):
    """Convert a numeral via ``mapping``, then ``int``; unparseable text counts as 0."""
    if mapping.get(token) is not None:
        return int(mapping.get(token))
    try:
        return int(token)
    except ValueError:
        return 0


day_index = find.find('天')
if day_index == -1:
    day_index = find.find('日')
hour_index = find.find('小时')
minute_index = find.find('分钟')

# `cursor` marks where the next number starts. Deriving that start from the index
# of an absent unit (-1) would cut off or lose the number, e.g. in '15分钟'.
cursor = 0
day = hour = minute = 0
if day_index != -1:
    day = _parse_amount(find[cursor:day_index])
    cursor = day_index + 1
if hour_index != -1:
    hour = _parse_amount(find[cursor:hour_index])
    cursor = hour_index + 2
if minute_index != -1:
    minute = _parse_amount(find[cursor:minute_index])

delta_minute = 24*60*day + 60*hour +minute
formatted = '?y-?m-?d-?H:+{}M:00S'.format(delta_minute)
    




In [139]:
formatted

'?y-?m-?d-?H:+4445M:00S'

In [140]:
t.evl(formatted)

datetime.datetime(2018, 8, 11, 15, 58, tzinfo=<DstTzInfo 'Asia/Shanghai' CST+8:00:00 STD>)

In [54]:
a = set(['1','2','3'])
a

{'1', '2', '3'}

In [55]:
b = '|'

In [56]:
b.join(a)

'1|2|3'

In [41]:
re.findall(r'','明天下午就还}')

['', '', '', '', '', '', '', '']

# 2018 0808 DEV

## case1 - 某年某月某日

In [555]:

# Regex building blocks for the 2018-08-08 rewrite. Every piece is a plain pattern
# string that is combined with str.format into the larger expressions below.

# One numeral character: Chinese numerals, multiplier characters or an ASCII digit.
num_cn = r'[一二三四五六七八九十零〇两百千0-9]'
# Month: a relative word or a run of numerals, an optional '个', then '月'.
rela_month = r'(?:下下下|下下|再下|下|(?:{}+))(?:个)?月'.format(num_cn)
fix_month = r'(?:{}+)月'.format(num_cn)
month_exp = r'(?:{}|{})'.format(rela_month,fix_month)


# Day: relative words or numerals followed by '天', numerals followed by '日'/'号',
# or a part-of-month descriptor.
rela_date = '(?:今|明|后|大后|大大后|再后|(?:{})+)天'.format(num_cn)
date_descrip = r'(?:上旬|中旬|下旬|月初|月中|月末|初|中|底|末)'
# fix_date = r'(?:{}+)(?:日|号)'.format(num_cn)
fix_date = r'(?:(?:{}+)(?:日|号))|{}'.format(num_cn,date_descrip)
date_exp = r'(?:{}|{})'.format(rela_date,fix_date)

# Year: the relative branch takes a word or a single numeral character, the fixed
# branch takes a run of numerals; both end with '年'.
rela_year = r'(?:今|明|后|后后|再后|下一|再下一|下|再下|(?:{}))年'.format(num_cn)
fix_year = r'(?:{}+)年'.format(num_cn)
year_exp = r'(?:{}|{})'.format(rela_year,fix_year)

# Hour: a duration ("N[个]小时/钟头") or a clock time made of day-part words and
# "N点" with an optional minute suffix.
rela_hour= r'(?:{}+(?:个)?)(?:小时|钟头)'.format(num_cn)
hour_descrip = r'早晨|凌晨|早上|半夜|中午|下午|傍晚|晚上|清晨|午后'
fix_minute = r'(?:(?:过)?一刻|(?:过)?{}+(?:分)?|半(?:过)?|过)+'.format(num_cn)
fix_hour= r'(?:(?:{})|(?:(?:{}+)点(?:{})?))+'.format(hour_descrip,num_cn,fix_minute)
hour_exp = r'(?:{}|{})'.format(rela_hour,fix_hour)




# only process relative time
rela_minute = r'(?:{}+|几)分钟|一刻钟'.format(num_cn)
minute_exp = r'(?:{})'.format(rela_minute)




# Week: optional relative prefix, a week word, an optional weekday character and
# optional hour / minute parts.
rela_week = r'(?:(?:这|下|再下|下下|再后)(?:{}+)?(?:个)?)|(?:{}+)'.format(num_cn,num_cn)
weekDay = r'[1-7天一二三四五六七]'
week_unit = r'周|星期|礼拜'
exp_week = r'(?:(?:{rw})?(?:{wu})+(?:{wd})?(?:{h})?(?:{mm})?)+'.format(rw=rela_week,wu=week_unit,wd=weekDay,
                                                              h=hour_exp,mm=minute_exp)
# Any run of year / month / day / hour / minute pieces, in any order.
exp_ymd = r'(?:(?:{y})|(?:{m})|(?:{d})|(?:{h})|(?:{mm}))+'.format(y=year_exp,m=month_exp,
                                                                 d=date_exp,h=hour_exp,
                                                                 mm=minute_exp)


In [556]:
re.findall(minute_exp,'给我一刻钟')

['一刻钟']

In [12]:

finds1 = re.findall(exp_ymd,'我下个月10号可以还')[0]
finds2 = re.findall(exp_ymd,'我1月10号二零年可以还')[0]
finds2

'1月10号二零年'

### Process Year

In [13]:
#extract year

def YearExtractor(sentence):
    """
    Extract year mentions from ``sentence`` and convert each to an expression.

    Matching uses the module-level ``year_exp`` pattern.

    Returns a list of (matched text, expression) tuples, where the expression is
    '?' (unknown / current), '+N' (relative) or a 4-digit year. When nothing
    matches, the list is [(None, '?')].
    """
    relative_years = {'今':'?','明':'+1','后':'+2','后后':'+3',
                      '再后':'+3','下一':'+1','下':'+1','再下一':'+2','再下':'+2'}

    def digits_to_year(token):
        # TODO: more cases need handling (e.g. Chinese numerals currently give '?').
        if not token.isdigit():
            return '?'
        # Four digits are a calendar year; any other length is treated as a delta.
        return token if len(token) == 4 else '+' + token

    def to_expression(fragment):
        # fragment has the form "<x>年"
        cut = fragment.find('年')
        if cut == -1:
            return '?'
        prefix = fragment[:cut]
        mapped = relative_years.get(prefix)
        return mapped if mapped is not None else digits_to_year(prefix)

    matches = re.findall(year_exp,sentence)
    if len(matches) == 0:
        return [(None,'?')]
    pairs = []
    for fragment in matches:
        try:
            expression = to_expression(fragment)
        except Exception:
            expression = '?'
        pairs.append((fragment,expression))
    return pairs

# Process Month

In [17]:
finds2 = '下个月'
print(finds2)
print(re.findall(month_exp,finds2))
sentence = finds2
year_extracted_list = re.findall(month_exp,sentence)[0]

下个月
['下个月']


In [18]:
# Month2Exp
text = '五月'
c = CHN2NUM()
def MonthExtractor(sentence):
    """
    Extract month mentions from ``sentence`` and convert each to an expression.

    Matching uses the module-level ``month_exp`` pattern; Chinese numerals are
    converted with the module-level ``c.transform``.

    Returns a list of (matched text, expression) tuples, where the expression is
    '?' (unknown), '+N' (relative) or a month number. When nothing matches, the
    list is [(None, '?')].
    """
    def Month2Exp(text):
        month_map = {'下下下':'+3','下下':'+2','再下':'+2','下':'+1'}
        def CHN2Month(x,rela=True):
            """
            CHN2Month("二",rela=True)   --> +2
            CHN2Month("二",rela=False)   --> 2
            CHN2Month("二十",rela=False) --> +20
            """
            if x.isdigit():
                if int(x) > 12 or rela:
                    return '+'+x
                return x
            # Not digits: convert the numeral first. The previous version
            # returned from a `finally` block, which swallowed every exception
            # (KeyboardInterrupt included); only ordinary errors map to '?' now.
            try:
                converted = str(c.transform(x))
                if int(converted) > 12 or rela:
                    converted = '+'+converted
            except Exception:
                return '?'
            return converted
        index = text.find('月')
        m = None
        if index == -1:
            m = '?'
        else:
            # step 1. get mapping from month_map
            index_rala = text.find('个月')
            if index_rala != -1:
                index = index_rala
            content = text[:index]
            if month_map.get(content) is not None:
                m = month_map.get(content)

            # step 2. Chinese numeral to calendar month
            else:
                # "N个月" is a relative amount, "N月" a calendar month
                m = CHN2Month(content,rela=(index_rala != -1))
        return m
    ################## end define ############
    extracted_list = re.findall(month_exp,sentence)
    finds_list = []
    if len(extracted_list) > 0:
        for each in extracted_list:
            try:
                m = Month2Exp(each)
            except Exception:
                m = '?'
            # Appending after the try block (not in `finally`) avoids touching an
            # unbound `m` when a non-Exception error is propagating.
            finds_list.append((each,m))
    else:
        m = '?'
        finds_list.append((None,m))
    return finds_list

In [19]:
MonthExtractor('我1月就还')

[('1月', '1')]

# Process Date

In [96]:
rela_date = '(?:今|明|后|大后|大大后|再后|(?:{})+)天'.format(num_cn)
date_descrip = r'(?:上旬|中旬|下旬|月初|月中|月末|初|中|底|末)'
# fix_date = r'(?:{}+)(?:日|号)'.format(num_cn)
fix_date = r'(?:(?:{}+)(?:日|号))|{}'.format(num_cn,date_descrip)
date_exp = r'(?:{}|{})'.format(rela_date,fix_date)

month_flag = True

def DateExtractor(sentence,month_flag = True):
    """
    Extract day mentions from ``sentence`` and convert each to an expression.

    Matching uses the module-level ``date_exp`` pattern; Chinese numerals are
    converted with the module-level ``c.transform``.

    month_flag: controls how "N日" is read — as the N-th day of the month when
        True, as a relative number of days when False.

    Returns a list of (matched text, expression) tuples, where the expression is
    '?' (unknown / today), '+N' (relative) or a day number. When nothing
    matches, the list is [(None, '?')].
    """
    def Date2Exp(text):
        date_map = {'今':'?','明':'+1','后':'+2','大后':'+3','大大后':'+4','再后':'+3',
                    '上旬':'10','中旬':'15','下旬':'28','月初':'10','月中':'15','月末':'28',
                    '初':'10','中':'15','底':'28','末':'28'}
        def CHN2Date(x,rela=True):
            """
            CHN2Date("二",rela=True)   --> +2
            CHN2Date("二",rela=False)   --> 2
            CHN2Date("二十",rela=False) --> +20
            """
            if x.isdigit():
                if int(x) > 31 or rela:
                    return '+'+x
                return x
            # Not digits: convert the numeral first. The previous version
            # returned from a `finally` block, which swallowed every exception;
            # only ordinary errors map to '?' now.
            try:
                converted = str(c.transform(x))
                if int(converted) > 31 or rela:
                    converted = '+'+converted
            except Exception:
                return '?'
            return converted
        # case 0. a part-of-month descriptor (上旬, 月初, 底, ...) matched as a
        # whole. These date_map entries were never consulted before, so every
        # descriptor came out as '?'.
        if text in date_map:
            d = date_map[text]
        # case 1. "N号" is always a day of the month
        elif text.find('号') != -1:
            index_date = text.find('号')
            d = CHN2Date(text[:index_date],rela=False)
        # case 2. "<word or N>天" is relative
        elif text.find('天') != -1:
            index_date = text.find('天')
            descrip = text[:index_date]
            d = date_map.get(descrip)
            if d is None:
                d = CHN2Date(descrip,rela=True)
        # case 3. "N日" depends on month_flag
        elif text.find('日') != -1:
            index_date = text.find('日')
            d = CHN2Date(text[:index_date],rela=not month_flag)
        else:
            d = '?'
        return d
    ################## end define ############
    extracted_list = re.findall(date_exp,sentence)
    finds_list = []
    if len(extracted_list) > 0:
        for each in extracted_list:
            try:
                d = Date2Exp(each)
            except Exception:
                d = '?'
            # Appending after the try block (not in `finally`) avoids touching an
            # unbound `d` when a non-Exception error is propagating.
            finds_list.append((each,d))
    else:
        d = '?'
        finds_list.append((None,d))
    return finds_list

    
    
            
   
  


In [99]:
DateExtractor('两日后就还',month_flag=False)

[('两日', '+2')]

# Process Hour

In [540]:
rela_hour= r'(?:{}+(?:个)?(?:半)?)(?:小时|钟头)'.format(num_cn)
hour_descrip = r'早晨|凌晨|早上|半夜|中午|下午|傍晚|晚上|清晨|午后'
fix_minute = r'(?:(?:过)?一刻|(?:过)?{}+(?:分)?|半(?:过)?|过)+'.format(num_cn)
fix_hour= r'(?:(?:{})|(?:(?:{}+)点(?:{})?))+'.format(hour_descrip,num_cn,fix_minute)
hour_exp = r'(?:{}|{})'.format(rela_hour,fix_hour)

In [478]:
re.findall(hour_exp,'3个半小时')

['3个半小时']

In [371]:
%%timeit
hour_map = {'早晨':11,'凌晨':6,'早上':11,'半夜':6,'清晨':11,
                        '中午':14,'下午':18,'傍晚':21,'晚上':23,'午后':15,}
reg_hour = r'|'.join(hour_map.keys())

685 ns ± 71.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [541]:


def HourExtractor(sentence,date_flag=None):
    """
    Find every hour expression in `sentence` and convert it to hour/minute codes.

    Depends on three names from earlier notebook cells: `hour_exp` (regex used
    to locate the expressions), `hour_descrip` (regex of part-of-day words) and
    `c` (an object whose `transform(text)` turns a Chinese numeral into a
    number).

    Clock times such as '3点15' or '3点过一刻' and durations such as
    '3个半小时' are handled.

    Args:
        sentence: text to scan.
        date_flag: unused placeholder.

    Returns:
        A list of `(matched_text, hour, minute)` tuples, one per match. Codes
        are strings: a bare number is a clock value, '+N' is an offset and
        '?' means "not determined". When nothing matches the result is
        `[(None, '?', '?')]`.
    """
    minute_map = {'过':'30','一刻':'15','半':'30','半过':'45'}
    # hour assigned to each part-of-day word; it is reported as the hour when
    # the word stands alone, and a value of 12 or more moves clock hours
    # below 12 into the afternoon (see ShiftTime)
    hour_map = {'早晨':11,'凌晨':6,'早上':11,'半夜':6,'清晨':11,
                '中午':14,'下午':18,'傍晚':21,'晚上':23,'午后':15,}
    minute_filler = r'过|分|钟'
    hour_unit = r'(?:个)?(?:小时|钟头)'
    hour_marker = '!H!'
    default_hours = 17  # used when the text carries no part-of-day word

    def to_code(x,limit,rela,fallback):
        """
        Convert a numeral to a code string.

        to_code("二",24,True,'?')    --> +2
        to_code("二",24,False,'?')   --> 2
        to_code("二十五",24,False,'?') --> +25 (above `limit`, so an offset)

        Returns `fallback` when the text cannot be converted.
        """
        try:
            number = x if x.isdigit() else str(c.transform(x))
            offset = int(number) > limit or rela
        except Exception:
            return fallback
        return '+'+number if offset else number

    def Minute2Exp(text):
        """Convert the text that follows '点' into a minute code."""
        mm = minute_map.get(text)
        if mm is None:
            # drop filler characters and look again, e.g. '过一刻' -> '一刻'
            text = re.sub(minute_filler,'',text)
            mm = minute_map.get(text)
        if mm is None:
            mm = to_code(text,59,False,'00')
        # report "no minutes" in the same two-digit form as the fallback value
        return '00' if mm == '0' else mm

    def ShiftTime(x,am_pm):
        """Add 12 to a clock hour below 12 when `am_pm` (an hour value) is 12-23."""
        if not x.isdigit():
            return x
        hour = int(x)
        if hour < 12 and am_pm//12 == 1:
            hour += 12
        return str(hour)

    def Hour2Exp(text):
        """Convert one matched hour expression into `(hour, minute)` codes."""
        # 1. read the part-of-day word (first one wins), then remove all of them
        described = re.findall(hour_descrip,text)
        if described:
            hours = hour_map.get(described[0],default_hours)
            text = re.sub(hour_descrip,'',text)
        else:
            hours = default_hours
        mm = '?'
        # case 1. clock time: '<hour>点<minute>'
        if '点' in text:
            index_hour = text.find('点')
            h = ShiftTime(to_code(text[:index_hour],24,False,'?'),hours)
            mm = Minute2Exp(text[index_hour+1:])
        # case 2. duration: '<n>个小时', '<n>个半小时', ...
        elif re.search(hour_unit,text):
            if '半' in text:
                mm = '+30'
                text = text.replace('半','')
            marked = re.sub(hour_unit,hour_marker,text)
            h = to_code(marked[:marked.find(hour_marker)],24,True,'?')
        # case 3. only a part-of-day word
        else:
            h = str(hours)
        return h,mm

    ################## end define ############
    extracted_list = re.findall(hour_exp,sentence)
    if not extracted_list:
        return [(None,'?','?')]
    finds_list = []
    for each in extracted_list:
        try:
            h,mm = Hour2Exp(each)
        except Exception:
            # best effort: keep the match but mark both fields as unknown
            h,mm = '?','?'
        finds_list.append((each,h,mm))
    return finds_list
        

In [542]:
HourExtractor('过24小时')

[('24小时', '+24', '?')]

# Process Minute

In [360]:
re.findall(fix_minute,'下午3点过')

['过']

In [568]:
# Only relative minute expressions (durations) are matched here; clock minutes
# such as '3点15分' are covered by fix_minute inside hour_exp.
# rela_minute: numerals or 几 followed by 分钟, or the fixed phrase 一刻钟.
# `num_cn` comes from an earlier cell that is not part of this section.
rela_minute = r'(?:{}+|几)分钟|一刻钟'.format(num_cn)
# wrapped in a group so the alternation can be embedded in larger patterns
minute_exp = r'(?:{})'.format(rela_minute)

In [569]:
def MinuteExtractor(sentence,hour_flag=None):
    """
    Find relative minute expressions in `sentence` and convert them to codes.

    Only durations are handled here; clock minutes are processed together
    with the hour. Depends on two names from earlier notebook cells:
    `minute_exp` (regex used to locate the expressions) and `c` (an object
    whose `transform(text)` turns a Chinese numeral into a number).

    Args:
        sentence: text to scan.
        hour_flag: unused placeholder.

    Returns:
        A list of `(matched_text, minute)` tuples, one per match. `minute` is
        '+N' for an offset of N minutes, or '00' when the text cannot be
        converted. When nothing matches the result is `[(None, '00')]`.
    """
    minute_map = {'一刻钟':'+15'}

    def CHN2Minute(x,rela=True):
        """
        CHN2Minute("二",rela=True)    --> +2
        CHN2Minute("二",rela=False)   --> 2
        CHN2Minute("六十一",rela=False) --> +61 (above 59, so an offset)

        Returns '00' when the text cannot be converted.
        """
        try:
            number = x if x.isdigit() else str(c.transform(x))
            offset = int(number) > 59 or rela
        except Exception:
            return '00'
        return '+'+number if offset else number

    def Minute2Exp(text):
        """Convert one matched expression into a minute code."""
        # case 1. fixed phrase
        mapped = minute_map.get(text)
        if mapped is not None:
            return mapped
        # case 2. '<numeral>分钟': convert what precedes the unit
        index_minute = text.find('分钟')
        if index_minute == -1:
            return '00'
        return CHN2Minute(text[:index_minute],rela=True)

    ################## end define ############
    extracted_list = re.findall(minute_exp,sentence)
    if not extracted_list:
        return [(None,'00')]
    finds_list = []
    for each in extracted_list:
        try:
            mm = Minute2Exp(each)
        except Exception:
            # best effort: keep the match with the neutral value
            mm = '00'
        finds_list.append((each,mm))
    return finds_list
        

In [573]:
MinuteExtractor('再给我15分钟')

[('15分钟', '+15')]

# Process Week

In [None]:
# Regex sources for week expressions. `num_cn`, `hour_exp` and `minute_exp`
# come from earlier cells.
# rela_week: which week is meant: a relative word (这/下/再下/下下/再后) with an
# optional count and 个, or a bare count. This is a bare alternation, so it is
# wrapped in a group where it is embedded below.
rela_week = r'(?:(?:这|下|再下|下下|再后)(?:{}+)?(?:个)?)|(?:{}+)'.format(num_cn,num_cn)
# weekDay: one character naming the day of the week
weekDay = r'[1-7天一二三四五六七]'
# week_unit: the word for "week" (bare alternation, wrapped where embedded)
week_unit = r'周|星期|礼拜'
# exp_week: [which week] + week word(s) + [weekday] + [hour] + [minute], repeated
exp_week = r'(?:(?:{rw})?(?:{wu})+(?:{wd})?(?:{h})?(?:{mm})?)+'.format(rw=rela_week,wu=week_unit,wd=weekDay,
                                                              h=hour_exp,mm=minute_exp)

In [582]:
re.findall(exp_week,'过3周')

['3周']

In [603]:
def WeekExtractor(sentence):
    """
    Find week expressions in `sentence` and convert them to week/weekday codes.

    Depends on names from earlier notebook cells: `exp_week`, `rela_week`,
    `week_unit` and `weekDay` (regex sources) and `c` (an object whose
    `transform(text)` turns a Chinese numeral into a number).

    `exp_week` may also swallow a trailing hour/minute expression; only the
    leading "[which week] week-word [weekday]" part of each match is
    converted and reported here.

    Args:
        sentence: text to scan.

    Returns:
        A list of `(week_text, W, w)` tuples, one per match. `W` is the week
        offset ('+N', or '?' when none is given) and `w` the day of the week
        ('1'..'7', or '?' when none is given). When nothing matches the
        result is `[(None, '?', '?')]`.
    """
    replace_wu = '!wu!'
    reg_wu = r'(?:{})+'.format(replace_wu)
    rela_week_map = {'这':'?','下':'+1','再下':'+2','下下':'+2','再后':'+2'}
    # weekday words that are not numerals; '日' is listed for completeness
    # even though the current weekDay pattern does not match it
    weekday_map = {'天':'7','日':'7'}
    # the week part proper, without any trailing hour/minute text
    week_head = re.compile(r'(?:{rw})?(?:{wu})+(?:{wd})?'.format(rw=rela_week,wu=week_unit,wd=weekDay))

    def CHN2Week(x,rela=True):
        """
        CHN2Week("二",rela=True)   --> +2
        CHN2Week("二",rela=False)  --> 2

        Returns '?' when the text cannot be converted.
        """
        if not x.isdigit():
            try:
                x = str(c.transform(x))
            except Exception:
                return '?'
        return '+'+x if rela else x

    def Week2Exp(text):
        """Convert one week expression into `(W, w)` codes."""
        # replace the week word(s) by a single marker so the text can be split
        text = re.sub(week_unit,replace_wu,text)
        text = re.sub(reg_wu,replace_wu,text)
        index_wu = text.find(replace_wu)
        if index_wu == -1:
            return '?','?'
        Week_info = text[:index_wu].replace('个','')
        WeekDay_info = text[index_wu+len(replace_wu):]
        if not Week_info:
            W = '?'
        else:
            W = rela_week_map.get(Week_info)
            if W is None:
                W = CHN2Week(Week_info,rela=True)
        if not WeekDay_info:
            w = '?'
        else:
            w = weekday_map.get(WeekDay_info)
            if w is None:
                w = CHN2Week(WeekDay_info,rela=False)
        return W,w

    ################## end define ############
    extracted_list = re.findall(exp_week,sentence)
    if not extracted_list:
        return [(None,'?','?')]
    finds_list = []
    for each in extracted_list:
        # keep only the week part; a following hour such as '3点' must not
        # be read as part of the weekday
        head = week_head.match(each)
        if head is not None:
            each = head.group()
        try:
            W,w = Week2Exp(each)
        except Exception:
            W,w = '?','?'
        finds_list.append((each,W,w))
    return finds_list
        
                

In [607]:
WeekExtractor('再下个周一')

再下
一


[('再下个周一', '+2', '1')]

In [29]:
sys.path.append('../../Lib')
from LOG import Logger

class CHN2NUM:
    """
    Convert Chinese numerals (optionally mixed with ASCII digits) to integers.

    `CN_NUM` maps digit characters to their value, `CN_UNIT` maps unit
    characters (十, 百, 千, 万, 亿, ...) to their multiplier.
    """
    def __init__(self):
        self.CN_NUM = {
    '〇' : 0, '一' : 1, '二' : 2, '三' : 3, '四' : 4, '五' : 5, '六' : 6, '七' : 7, '八' : 8, '九' : 9, '零' : 0,
    '壹' : 1, '贰' : 2, '叁' : 3, '肆' : 4, '伍' : 5, '陆' : 6, '柒' : 7, '捌' : 8, '玖' : 9, '貮' : 2, '两' : 2
}

        # ASCII digits are accepted as well, e.g. "3百"
        for i in range(10):
            self.CN_NUM[str(i)] = i
        self.CN_UNIT = {
                            '十' : 10,
                            '拾' : 10,
                            '百' : 100,
                            '佰' : 100,
                            '千' : 1000,
                            '仟' : 1000,
                            '万' : 10000,
                            '萬' : 10000,
                            '亿' : 100000000,
                            '億' : 100000000,
                            '兆' : 1000000000000,
                        }

    def _tokenize(self,cn):
        """
        Split `cn` into `(is_unit, value)` tokens, left to right.

        Digit characters that directly follow each other form one number read
        positionally ("15" -> 15, "二〇一八" -> 2018, "零五" -> 5).
        """
        tokens = []
        run = None  # value of the digit run being read, None between runs
        for ch in cn:
            if ch in self.CN_UNIT:
                if run is not None:
                    tokens.append((False,run))
                    run = None
                tokens.append((True,self.CN_UNIT[ch]))
            elif ch in self.CN_NUM:
                digit = self.CN_NUM[ch]
                run = digit if run is None else run*10 + digit
            else:
                raise ValueError('unsupported numeral character {!r} in {!r}'.format(ch,cn))
        if run is not None:
            tokens.append((False,run))
        return tokens

    def transform(self,cn:str):
        """
        Convert the numeral string `cn` to an int.

        transform("十五")       --> 15
        transform("三百二十一") --> 321
        transform("一亿二千万") --> 120000000
        transform("15")         --> 15
        transform("二〇一八")   --> 2018
        transform("")           --> 0

        Raises:
            ValueError: if `cn` contains a character that is neither a digit
                nor a unit.
        """
        unit = 0    # multiplier waiting for the digit to its left
        parts = []  # (is_section, value), collected right to left
        for is_unit,value in reversed(self._tokenize(cn)):
            if is_unit:
                unit = value
                if unit == 10000 or unit == 100000000:
                    # 万 / 亿 scale the whole section on their left
                    parts.append((True,unit))
                    unit = 1
            else:
                if unit:
                    value *= unit
                    unit = 0
                parts.append((False,value))
        # a leading 十 has no digit in front of it ("十五"): it counts as 10
        if unit == 10:
            parts.append((False,10))
        val, tmp = 0, 0
        for is_section,x in reversed(parts):
            if is_section:
                val += tmp * x
                tmp = 0
            else:
                tmp += x
        val += tmp
        return val


class ReExtractor:
    def __init__(self):
        """
        Create the numeral converter, build the regex sources and attach a logger.

        The logger is named after the class; its level is read from
        `ENV.MODEL_LOG_LEVEL.value`.
        """
        # converter used by the extractor methods for Chinese numerals
        self.CHN2NUM = CHN2NUM()
        self._init_reExpression()
        self.log = Logger(self.__class__.__name__,level=ENV.MODEL_LOG_LEVEL.value).logger
        
    def _init_reExpression(self):
        self.num_cn = r'[一二三四五六七八九十零〇两百千0-9]'
        
        self.year_rela_exp = r'(?:今|明|后|后后|再后|下一|再下一|下|再下|(?:{}))年'.format(self.num_cn)
        self.year_fix_exp = r'(?:{}+)年'.format(self.num_cn)
        self.year_exp = r'(?:{}|{})'.format(self.year_rela_exp,self.year_fix_exp)
        
        self.month_rela_exp = r'(?:下下下|下下|再下|下|(?:{}+))(?:个)?月'.format(self.num_cn)
        self.month_fix_exp = r'(?:{}+)月'.format(self.num_cn)
        self.month_exp = r'(?:{}|{})'.format(self.month_rela_exp,self.month_fix_exp)


        self.date_rela_exp = '(?:今|明|后|大后|大大后|再后|(?:{})+)天'.format(self.num_cn)
        self.date_descrip_exp = r'(?:上旬|中旬|下旬|月初|月中|月末|初|中|底|末)'
        self.date_fix_exp = r'(?:(?:{}+)(?:日|号))|{}'.format(self.num_cn,self.date_descrip_exp)
        self.date_exp = r'(?:{}|{})'.format(self.date_rela_exp,self.date_fix_exp)

        
        
        
        
        self.hour_rela_exp= r'(?:{}+(?:个)?(?:半)?)(?:小时|钟头)'.format(self.num_cn)
        self.hour_descrip_exp = r'早晨|凌晨|早上|半夜|中午|下午|傍晚|晚上|清晨|午后'
        self.minute_fix_exp = r'(?:(?:过)?一刻|(?:过)?{}+(?:分)?|半(?:过)?|过)+'.format(self.num_cn)
        self.hour_fix_exp= r'(?:(?:{})|(?:(?:{}+)点(?:{})?))+'.format(self.hour_descrip_exp,
                                                                   self.num_cn,
                                                                     self.minute_fix_exp)
        self.hour_exp = r'(?:{}|{})'.format(self.hour_rela_exp,self.hour_fix_exp)
        
        
        self.minute_rela_exp = r'(?:{}+|几)分钟|一刻钟'.format(self.num_cn)
        self.minute_exp = r'(?:{})'.format(self.minute_rela_exp)


        self.week_rela_exp = r'(?:(?:这|下|再下|下下|再后)(?:{}+)?(?:个)?)|(?:{}+)'.format(self.num_cn,self.num_cn)
        self.weekDay_exp = r'[1-7天一二三四五六七]'
        self.week_unit = r'周|星期|礼拜'
        self.exp_week =  r'(?:(?:{rw})?(?:{wu})+(?:{wd})?(?:{h})?(?:{mm})?)+'.format(rw=self.week_rela_exp,
                                                                           wu=self.week_unit,
                                                                           wd=self.weekDay_exp,
                                                                           h=self.hour_exp,
                                                                            mm=self.minute_exp)

        self.exp_ymd = r'(?:(?:{y})|(?:{m})|(?:{d})|(?:{h})|(?:{mm}))+'.format(y=self.year_exp,
                                                                      m=self.month_exp,
                                                                      d=self.date_exp,
                                                                      h=self.hour_exp,
                                                                        mm=self.minute_exp)
        
    def YearExtractor(self,sentence):
        """
        this function will return a list of year it can extract
        """
        #########
        #######define
        
        def Year2Exp(text):
            """
            text has format of "<x>年"
            output: extracted expression
            """
            year_map = {'今':'?','明':'+1','后':'+2','后后':'+3',
                        '再后':'+3','下一':'+1','下':'+1','再下一':'+2','再下':'+2'}
            def CHN2Year(x):
                ##TODO: more case needs to be handled
                # case 1. digits
                if x.isdigit():
                    # case 1.1 length is 4. eg 2017
                    if len(x) == 4:
                        return x
                    # case 1.2 length is not 4. which may mean calcuate delta
                    else:
                        return '+'+x
                # needs to handle more cases
                else:
                    return '?'
            index = text.find('年')
            y = None
            if index == -1:
                y = '?'
            else:
                context = text[:index]
                # step 1. get mapping from year_map
                if year_map.get(context) is not None:
                    y = year_map.get(context)
                # step 2. CHN to canlender year    
                else:
                    y = CHN2Year(context)
            return y
        ########################## end define ############################
        extracted_list = re.findall(self.year_exp,sentence)
        finds_list = []
        if len(extracted_list) > 0:
            for each in extracted_list:
                try:
                    y = Year2Exp(each)
                except Exception as e:
                    self.log.error(e)
                    y = '?'
                finally:
                    finds_list.append((each,y))
        else:
            y = '?'
            finds_list.append((None,y))
        return finds_list 
    
    def MonthExtractor(self,sentence):
        """
        extract month info and convert to expression
        """
        def Month2Exp(text):
            month_map = {'下下下':'+3','下下':'+2','再下':'+2','下':'+1'}
            def CHN2Month(x,rela=True):
                """
                CHN2Month("二",rela=True)   --> +2
                CHN2Month("二",rela=False)   --> 2
                CHN2Month("二十",rela=False) --> +20
                """
                if x.isdigit():
                    if int(x) > 12 or rela:
                        return '+'+x
                    else:
                        return x
                # if not digit, then transform        
                else:
                    try:
                        x = str(self.CHN2NUM.transform(x))
                        if int(x) > 12 or rela:
                            x = '+'+x                
                    except Exception as e:
                        self.log.error(e)
                        x = '?'
                    finally:
                        return x
            index = text.find('月')
            m = None
            if index == -1:
                m = '?'
            else:
                # step 1. get mapping from month_map
                index_rala = text.find('个月')
                if index_rala != -1:
                    index = index_rala
                content = text[:index]
                if month_map.get(content) is not None:
                    m = month_map.get(content)

                # step 2. CHN to canlender month
                else:
                    if index_rala != -1:
                        ## process relative month logic
                        m = CHN2Month(content,rela=True)
                    else:
                        ## process nonrelative month logic
                        m = CHN2Month(content,rela=False)
            return m
        ################## end define ############
        extracted_list = re.findall(self.month_exp,sentence)
        finds_list = []
        if len(extracted_list) > 0:
            for each in extracted_list:
                try:
                    m = Month2Exp(each)
                except Exception as e:
                    self.log.error(e)
                    m = '?'
                finally:
                    finds_list.append((each,m))
        else:
            m = '?'
            finds_list.append((None,m))
        return finds_list
    
    def DateExtractor(self,sentence,month_flag = True):
        """
        extract month info and convert to expression
        """
        def Date2Exp(text):
            date_map = {'今':'?','明':'+1','后':'+2','大后':'+3','大大后':'+4','再后':'+3',
                        '上旬':'10','中旬':'15','下旬':'28','月初':'10','月中':'15','月末':'28',
                        '初':'10','中':'15','底':'28','末':'28'}
            def CHN2Date(x,rela=True):
                """
                CHN2Date("二",rela=True)   --> +2
                CHN2Date("二",rela=False)   --> 2
                CHN2Date("二十",rela=False) --> +20
                """
                if x.isdigit():
                    if int(x) > 31 or rela:
                        return '+'+x
                    else:
                        return x
                # if not digit, then transform        
                else:
                    try:
                        x = str(self.CHN2NUM.transform(x))
                        if int(x) > 31 or rela:
                            x = '+'+x                
                    except Exception as e:
                        self.log.error(e)
                        x = '?'
                    finally:
                        return x
            # case 1. 号
            if text.find('号') != -1:
                index_date = text.find('号')
                d = CHN2Date(text[:index_date],rela=False)
            # case 2. 天
            elif text.find('天') != -1:
                index_date = text.find('天')
                descrip = text[:index_date]
                d = date_map.get(descrip)
                if d is None:
                    d = CHN2Date(descrip,rela=True)
            # case 3. 天
            elif text.find('日') != -1:
                index_date = text.find('日')
                descrip = text[:index_date]
                if month_flag:
                    d = CHN2Date(text[:index_date],rela=False)
                else:
                    d = CHN2Date(text[:index_date],rela=True)
            else:
                d = '?'
            return d
        ################## end define ############
        extracted_list = re.findall(self.date_exp,sentence)
        finds_list = []
        if len(extracted_list) > 0:
            for each in extracted_list:
                try:
                    d = Date2Exp(each)
                except Exception as e:
                    self.log.error(e)
                    d = '?'
                finally:
                    finds_list.append((each,d))
        else:
            d = '?'
            finds_list.append((None,d))
        return finds_list
    
    

    def HourExtractor(self,sentence,date_flag=None):
        """
        extract hour info and convert to expression
        date_flag will just be used as a placeholder
        3点15，
        3点过一刻
        can be extracted
        """
        def Minute2Exp(text):
            minute_map = {'过':'30','一刻':'15','半':'30','半过':'45'}
            reg_period = r'过|分|钟'
            def CHN2Minute(x,rela=True):
                """
                CHN2Minute("二",rela=True)   --> +2
                CHN2Minute("二",rela=False)   --> 2
                CHN2Minute("二十",rela=False) --> +20
                """
                if x.isdigit():
                    if int(x) > 59 or rela:
                        return '+'+x
                    else:
                        return x
                # if not digit, then transform        
                else:
                    try:
                        x = str(self.CHN2NUM.transform(x))
                        if int(x) > 59 or rela:
                            x = '+'+x                
                    except Exception as e:
                        self.log.error(e)
                        x = '00'
                    finally:
                        if x == '0':
                            x = '00'
                        return x
            # case 1. can get value from defined mapping
            if minute_map.get(text) is not None:
                mm = minute_map.get(text)
            # case 2. need to remove some words
            else:
                text_sub = re.sub(reg_period,'',text)
                if minute_map.get(text_sub) is not None:
                    mm = minute_map.get(text_sub)
                # cannot get value from defined mapping
                else:
                    try:
                        mm = CHN2Minute(text_sub, rela=False)
                    except Exception as e:
                        self.log.error(e)
                        mm = '00'
            return mm

        def Hour2Exp(text):
                hour_map = {'早晨':11,'凌晨':6,'早上':11,'半夜':6,'清晨':11,
                            '中午':14,'下午':18,'傍晚':21,'晚上':23,'午后':15,}
                reg_period = r'(?:个)?(?:小时|钟头)'
                def CHN2Hour(x,rela=True):
                    """
                    CHN2Date("二",rela=True)   --> +2
                    CHN2Date("二",rela=False)   --> 2
                    CHN2Date("二十",rela=False) --> +20
                    """
                    if x.isdigit():
                        if int(x) > 24 or rela:
                            return '+'+x
                        else:
                            return x
                    # if not digit, then transform        
                    else:
                        try:
                            x = str(self.CHN2NUM.transform(x))
                            if int(x) > 24 or rela:
                                x = '+'+x                
                        except Exception as e:
                            self.log.error(e)
                            x = '?'
                        finally:
                            return x
                def ShiftTime(x,am_pm):
                    am_pm = am_pm//12
                    if x.isdigit():
                        x = int(x) 
                        if x < 12 and am_pm == 1:
                            x += 12
                        return str(x)
                    else:
                        return x

                #1. Judge in the morning or afternoon. Then remove the description words
                des_t = re.findall(self.hour_descrip_exp,text)
                if len(des_t) > 0:
                    des_t = des_t[0]
                    hours = hour_map.get(des_t)
                    if hours is None:
                        hours = 17
                    text = re.sub(self.hour_descrip_exp,'',text)
                else:
                    hours = 17
                mm = '00'
                # case 1.  几点

                if len(re.findall(r'点',text)) > 0:
                    index_hour = text.find('点')
                    h = CHN2Hour(text[:index_hour],rela=False)
                    h = ShiftTime(h,hours)
                    mm = Minute2Exp(text[index_hour+1:])
                # case 2. 几个小时
                elif len(re.findall(reg_period,text)) > 0:
                    replace = '!H!'
                    if text.find('半') != -1:
                        mm = '+30'
                        text = re.sub(r'半','',text)
                    finds = re.sub(reg_period,replace,text)
                    index_hour = finds.find(replace)
                    h = CHN2Hour(finds[:index_hour],rela=True)
                else:
                    h = str(hours)
                return h,mm
        ################## end define ############
        extracted_list = re.findall(self.hour_exp,sentence)
        finds_list = []
        if len(extracted_list) > 0:
            for each in extracted_list:
                try:
                    h,mm = Hour2Exp(each)
                except Exception as e:
                    self.log.error(e)
                   
                    h = '?'
                    mm = '?'
                finally:
                    finds_list.append((each,h,mm))
        else:
            h = '?'
            mm = '?'
            finds_list.append((None,h,mm))
        return finds_list
    
    
    def MinuteExtractor(self,sentence,hour_flag=None):
        """
        just extract relative time. fix time will be handled by hour
        """
        def Minute2Exp(text):
            minute_map = {'一刻钟':'+15'}
            def CHN2Minute(x,rela=True):
                """
                CHN2Minute("二",rela=True)   --> +2
                CHN2Minute("二",rela=False)   --> 2
                CHN2Minute("二十",rela=False) --> +20
                """
                if x.isdigit():
                    if int(x) > 59 or rela:
                        return '+'+x
                    else:
                        return x
                # if not digit, then transform        
                else:
                    try:
                        x = str(self.CHN2NUM.transform(x))
                        if int(x) > 59 or rela:
                            x = '+'+x                
                    except Exception as e:
                        self.log.error(e)
                        x = '00'
                    finally:
                        return x
            # case 1. can get value from defined mapping
            if minute_map.get(text) is not None:
                mm = minute_map.get(text)
            # case 2. need to remove some words
            else:
                index_minute = text.find('分钟')
                if index_minute != -1:
                    mm = CHN2Minute(text[:index_minute],rela=True)
                else:
                    mm = '00'
            return mm
        ################## end define ############
        extracted_list = re.findall(self.minute_exp,sentence)
        finds_list = []
        if len(extracted_list) > 0:
            for each in extracted_list:
                try:
                    mm = Minute2Exp(each)
                except Exception as e:
                    self.log.error(e)
                    mm = '00'
                finally:
                    finds_list.append((each,mm))
        else:
            mm = '00'
            finds_list.append((None,mm))
        return finds_list
    
    def WeekExtractor(self,sentence):
        def Week2Exp(text):
            replace_wu = '!wu!'
            reg_wu = r'(?:{})+'.format(replace_wu)
            rela_week_map = {'这':'?','下':'+1','再下':'+2','下下':'+2','再后':'+2'}
            def CHN2Week(x,rela=True):
                """
                CHN2Minute("二",rela=True)   --> +2
                CHN2Minute("二",rela=False)   --> 2
                CHN2Minute("二十",rela=False) --> +20
                """
                if x.isdigit():
                    if rela:
                        return '+'+x
                    else:
                        return x
                # if not digit, then transform        
                else:
                    try:
                        x = str(self.CHN2NUM.transform(x))
                        if rela:
                            x = '+'+x                
                    except Exception as e:
                        self.log.error(e)
                        x = '?'
                    finally:
                        return x
            # replace key word

            text = re.sub(self.week_unit,replace_wu,text)
            text = re.sub(reg_wu,replace_wu,text)
            # no such pattern
            index_wu = text.find(replace_wu)
            if index_wu == -1:
                W = '?'
                w = '?'
            else:
                Week_info = text[:index_wu]
                Week_info = re.sub(r'个','',Week_info)
                WeekDay_info = text[index_wu+len(replace_wu):]
                if len(Week_info) > 0:
                    if rela_week_map.get(Week_info) is not None:
                        W = rela_week_map.get(Week_info)
                    else:
                        W = CHN2Week(Week_info,rela=True)
                else:
                    W = '?'
                if len(WeekDay_info) > 0:
                    w = CHN2Week(WeekDay_info,rela=False)
                else:
                    w = '?'
            return W,w
        ################## end define ############
        extracted_list = re.findall(self.exp_week,sentence)      
        finds_list = []
        if len(extracted_list) > 0:
            for each in extracted_list:
                each = re.sub(self.hour_exp,'',each)
                try:
                    W,w = Week2Exp(each)
                except Exception as e:
                    self.log.error(e)
                    W = '?'
                    w = '?'
                finally:
                    finds_list.append((each,W,w))
        else:
            W = '?'
            w = '?'
            finds_list.append((None,W,w))
        return finds_list
        
    def evl_week(self,sentence):
        exp = '?y-{}W-{}w-{}H:{}M:00S'
        finds = re.findall(self.exp_week,sentence)
        return_list = []

        for each in finds:
            #1. process week and weekday
            extract_w = self.WeekExtractor(each)
            p,W,w = extract_w[0]
            if p is None:
                continue
            #2. process hour and minute
            extract_h = self.HourExtractor(each)
            p,h,mm = extract_h[0]
            if p is None:
                h = '18'
                mm = '00'
            return_list.append((each,exp.format(W,w,h,mm)))
            
        return return_list
                    
    
    
    
        
    
    
        

In [28]:
r = ReExtractor()
r.evl_week('下周下午1点点')

2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1
2018-09-07 21:26:02,362 - ERROR - CLASS:ReExtractor- METHOD:evl_week -LINE:591 - MSG:lala1

[('下周', '+1', '?')]


[('下周下午1点', '?y-+1W-?w-13H:00M:00S')]

In [679]:
r = ReExtractor()
r.WeekExtractor('我下个周二下午')

[('下个周二', '+1', '2')]

In [650]:
r = ReExtractor()
r.MinuteExtractor('再给我15分钟')

[('15分钟', '+15')]

In [612]:
r.WeekExtractor('下')

[(None, '?', '?')]

In [338]:

r.YearExtractor('2后')

[(None, '?')]

In [8]:
%%timeit
r.MonthExtractor('2年2个月以后')

3 µs ± 65.6 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [103]:
%%timeit
r.DateExtractor('2月14号以后')

4.22 µs ± 531 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [337]:
%%timeit
r.HourExtractor('2月14号下午4点')

7.37 µs ± 122 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


[('24小时', '?', '?')]