In [141]:
import pandas as pd
import numpy as np
import re
from parsi_io.modules.number_extractor import NumberExtractor

In [142]:
class MeasurementExtractor:
    """Lookup tables and entry point for rule-based measurement extraction.

    Every table below is built once, while the class body executes, from CSV
    files opened by relative path ("Units.csv", "PreUnitWords.csv",
    "DecimalFractions.csv", "Adjectives.csv", "Keywords.csv").

    The ``*_joined`` attributes are "|"-separated strings that the
    pattern-matching functions of this notebook interpolate verbatim into
    regular expressions, so CSV entries act as regex fragments rather than as
    escaped literals.
    """

    # Number extractor shared by the matchers that need numeric amounts.
    extractor = NumberExtractor()

    # Units.csv has no header row: column 0 is the quantity name and the
    # remaining columns hold its units. Ragged rows are padded with NaN by
    # pandas; the padding is replaced with 0 so it can be filtered out below.
    units_dataframe = pd.read_csv("Units.csv", header=None)
    units_dataframe = units_dataframe.replace(np.nan, 0)

    # Pre-unit words used in the
    # "pre-unit word + [decimal fraction] + unit" pattern.
    # After transposing, row 0 holds the first column of the file.
    preunits_dataframe = pd.read_csv("PreUnitWords.csv", header=None)
    preunits_dataframe = preunits_dataframe.transpose()
    preunits_list = preunits_dataframe.values.tolist()
    preunits_joined = "|".join(preunits_list[0])

    # Decimal-fraction words used in the same
    # "pre-unit word + [decimal fraction] + unit" pattern.
    decimal_fractions_dataframe = pd.read_csv("DecimalFractions.csv", header=None)
    decimal_fractions_dataframe = decimal_fractions_dataframe.transpose()
    decimal_fractions_list = decimal_fractions_dataframe.values.tolist()
    decimal_fractions_joined = "|".join(decimal_fractions_list[0])

    # Adjectives.csv has a header row; a value of 1 in a "valid for pattern N"
    # column enables the adjective for that pattern.
    adjectives_dataframe = pd.read_csv("Adjectives.csv", header=0)
    adjectives_dataframe = adjectives_dataframe.reset_index()
    pattern_3_adjectives = adjectives_dataframe.loc[
        adjectives_dataframe['valid for pattern 3'] == 1, 'adjective'
    ].tolist()
    pattern_4_rows = adjectives_dataframe[adjectives_dataframe['valid for pattern 4'] == 1]
    pattern_4_adjectives = pattern_4_rows['adjective'].tolist()
    pattern_4_adjective_to_type = dict(
        zip(pattern_4_rows['adjective'], pattern_4_rows['pattern 4 type'])
    )
    pattern_3_adjectives_joined = "|".join(pattern_3_adjectives)
    pattern_4_adjectives_joined = "|".join(pattern_4_adjectives)

    # Keywords.csv has a header row with 'keyword' and 'type' columns.
    # Every keyword is currently valid for pattern 1, so both lists hold the
    # same entries.
    keywords_dataframe = pd.read_csv("Keywords.csv", header=0)
    keywords_dataframe = keywords_dataframe.reset_index()
    all_keywords = keywords_dataframe['keyword'].tolist()
    pattern_1_keywords = list(all_keywords)
    pattern_1_keyword_to_type = dict(
        zip(keywords_dataframe['keyword'], keywords_dataframe['type'])
    )
    pattern_1_keywords_joined = "|".join(pattern_1_keywords)
    all_keywords_joined = "|".join(all_keywords)

    # Maps each quantity name to the list of its units (the 0 padding removed).
    units_dict = {
        row[0]: [unit for unit in row[1:].tolist() if unit != 0]
        for _, row in units_dataframe.iterrows()
    }

    # All units, in file order and sorted by length in descending order.
    # The sorted form lets a regex alternation try "foot per second" before
    # "foot", so the longest unit wins.
    all_units = [unit for units in units_dict.values() for unit in units]
    sorted_units = sorted(all_units, key=len, reverse=True)
    sorted_units_joined = "|".join(sorted_units)

    # All units joined in file order. Still read by
    # match_preunit_decimal_unit_pattern. Joining the flat list (instead of
    # concatenating per-quantity joins) avoids an empty regex alternative when
    # a quantity row has no units.
    units_joined = "|".join(all_units)

    # Maps a quantity's name from English to Farsi.
    quantity_name_translator = {
        "length": "طول",
        "mass": "وزن",
        "pressure": "فشار",
        "volume": "حجم",
        "temperature": "دما",
        "area":"مساحت",
        "speed":"سرعت",
        "force":"نیرو",
        "energy":"انرژی",
        "power":"توان",
        "torque":"گشتاور",
        "time":"زمان",
        "density":"چگالی",
        "frequency":"فرکانس",
        "degree":"زاویه",
        "acceleration":"شتاب",
        "debi":"شارش جرمی",
        "debi-v":"شارش حجمی",
        "data-storage":"ذخیره دیجیتال",
        "data-transfer":"انتقال داده"    
    }

    @staticmethod
    def get_quantity_type(unit):
        """Return the Persian quantity name for ``unit``.

        Returns 0 when no quantity in ``units_dict`` lists the unit. Raises
        KeyError when the unit's quantity name has no entry in
        ``quantity_name_translator``.
        """
        for quantity_name, quantity_units in MeasurementExtractor.units_dict.items():
            if unit in quantity_units:
                return MeasurementExtractor.quantity_name_translator[quantity_name]
        return 0

    @staticmethod
    def run(input_str):
        """Run every pattern matcher on ``input_str`` and concatenate the results.

        The matchers are module-level functions defined in later cells; they
        are looked up when ``run`` is called, so those cells must have been
        executed first. Results are concatenated in matcher order and are not
        de-duplicated, so overlapping spans may appear more than once.
        """
        output = match_keyword_adjective_pattern(input_str)
        output += match_keyword_amount_pattern(input_str)
        output += match_sole_adjecive_pattern(input_str)
        output += match_amount_unit_pattern(input_str)
        output += match_preunit_decimal_unit_pattern(input_str)
        return output

In [143]:
def match_keyword_adjective_pattern (input_str):
    """Detect quantities that follow the "keyword + adjective" pattern.

    Args:
        input_str: text to scan.

    Returns:
        A list with one dict per match, holding 'unit' and 'amount' (both
        empty strings, since this pattern carries neither), 'marker' (the
        matched text) and 'span' (its (start, end) offsets in ``input_str``).
    """
    keywords_joined = MeasurementExtractor.all_keywords_joined
    adjectives_joined = MeasurementExtractor.pattern_3_adjectives_joined
    # An empty alternation matches the empty string, which would report bare
    # keywords (or bare adjectives) as complete "keyword + adjective" phrases.
    if not keywords_joined or not adjectives_joined:
        return []

    # Raw f-string so that \s reaches the regex engine without an
    # invalid-escape-sequence warning.
    pattern = re.compile(rf'({keywords_joined})+\s*({adjectives_joined})+')
    return [
        {
            'unit': "",
            'amount': "",
            'marker': match.group(),
            'span': match.span(),
        }
        for match in pattern.finditer(input_str)
    ]

In [144]:
# Sample sentence for the "keyword + adjective" pattern.
# Named so that it does not shadow the builtin input().
keyword_adjective_sample = "رودخانه‌ای با عرض کم دیدم"
print(match_keyword_adjective_pattern(keyword_adjective_sample))

[]


In [145]:
def match_keyword_amount_pattern (input_str):
    """Detect quantities that follow the "keyword + amount" pattern.

    Args:
        input_str: text to scan.

    Returns:
        A list with one dict per match, holding 'unit' (always an empty
        string), 'amount' (the value the number extractor reported for the
        matched number phrase), 'marker' (the matched text) and 'span' (its
        (start, end) offsets in ``input_str``).
    """
    # Maps each number phrase reported by the extractor to its numeric value.
    phrase_amount_dict = {
        value['phrase']: value['value']
        for value in MeasurementExtractor.extractor.run(input_str)
        if value['phrase']
    }
    # Without any number phrase the amount group would be empty, match the
    # empty string after every keyword and fail on the value lookup.
    if not phrase_amount_dict:
        return []

    # Phrases are literal text, so they are escaped ("2.5" must not match
    # "2x5"). Longest first, so "12" is tried before its prefix "1".
    phrases_joined = "|".join(
        re.escape(phrase)
        for phrase in sorted(phrase_amount_dict, key=len, reverse=True)
    )
    pattern = re.compile(
        rf'({MeasurementExtractor.pattern_1_keywords_joined})+\s*({phrases_joined})+'
    )
    return [
        {
            'unit': "",
            'amount': phrase_amount_dict[match.group(2)],
            'marker': match.group(),
            'span': match.span(),
        }
        for match in pattern.finditer(input_str)
    ]

In [146]:
# Sample sentence for the "keyword + amount" pattern.
# Named so that it does not shadow the builtin input().
keyword_amount_sample = "خانه اتاقی به ضلع 4 و حیاطی با طول 12 دارد"
print(match_keyword_amount_pattern(keyword_amount_sample))

[{'unit': '', 'amount': 4.0, 'marker': 'ضلع 4', 'span': (14, 19)}, {'unit': '', 'amount': 12.0, 'marker': 'طول 12', 'span': (31, 37)}]


In [147]:
def match_sole_adjecive_pattern (input_str):
    """Detect standalone adjectives that imply a quantity.

    Scans ``input_str`` for any adjective enabled for pattern 4 and returns
    one dict per occurrence with 'unit' and 'amount' left as empty strings,
    'marker' set to the matched text and 'span' to its (start, end) offsets.
    """
    adjective_regex = f'({MeasurementExtractor.pattern_4_adjectives_joined})'

    found = []
    for hit in re.finditer(adjective_regex, input_str):
        record = {}
        record['unit'] = ""
        record['amount'] = ""
        record['marker'] = hit.group()
        record['span'] = hit.span()
        found.append(record)
    return found

In [148]:
# Sample sentence for the standalone-adjective pattern.
# Named so that it does not shadow the builtin input().
sole_adjective_sample = "سریع حرکت کرد"
print(match_sole_adjecive_pattern(sole_adjective_sample))

[{'unit': '', 'amount': '', 'marker': 'سریع', 'span': (0, 4)}]


In [149]:
def match_amount_unit_pattern (input_str):
    """Detect quantities that follow the "amount + unit" pattern.

    Args:
        input_str: text to scan.

    Returns:
        A list with one dict per match, holding 'type' (the result of
        ``MeasurementExtractor.get_quantity_type`` for the unit), 'amount'
        (the value the number extractor reported for the matched number
        phrase), 'unit', 'marker' (the matched text) and 'span' (its
        (start, end) offsets in ``input_str``).
    """
    # Maps each number phrase reported by the extractor to its numeric value.
    phrase_amount_dict = {
        value['phrase']: value['value']
        for value in MeasurementExtractor.extractor.run(input_str)
        if value['phrase']
    }
    # Without any number phrase the amount group would be empty, match the
    # empty string before every unit and fail on the value lookup.
    if not phrase_amount_dict:
        return []

    # Phrases are literal text, so they are escaped ("2.5" must not match
    # "2x5"). Longest first, so "12" is tried before its prefix "1".
    phrases_joined = "|".join(
        re.escape(phrase)
        for phrase in sorted(phrase_amount_dict, key=len, reverse=True)
    )
    pattern = re.compile(
        rf'({phrases_joined})+\s*({MeasurementExtractor.sorted_units_joined})+'
    )

    output = []
    for match in pattern.finditer(input_str):
        unit = match.group(2)
        output.append({
            'type': MeasurementExtractor.get_quantity_type(unit),
            'amount': phrase_amount_dict[match.group(1)],
            'unit': unit,
            'marker': match.group(),
            'span': match.span(),
        })
    return output

In [150]:
# amount + unit examples
# (named so that they do not shadow the builtin input()):
amount_unit_sample_fa = "علی سه متر پارچه و دو کیلوگرم آرد خرید و یک ساعت صبر کرد."
print(match_amount_unit_pattern(amount_unit_sample_fa))
amount_unit_sample_en = "2 Gb"
print(match_amount_unit_pattern(amount_unit_sample_en))

[{'type': 'طول', 'amount': 3, 'unit': 'متر', 'marker': 'سه متر', 'span': (4, 10)}, {'type': 'وزن', 'amount': 2, 'unit': 'کیلوگرم', 'marker': 'دو کیلوگرم', 'span': (19, 29)}, {'type': 'زمان', 'amount': 1, 'unit': 'ساعت', 'marker': 'یک ساعت', 'span': (41, 48)}]
[{'type': 'ذخیره دیجیتال', 'amount': 2.0, 'unit': 'Gb', 'marker': '2 Gb', 'span': (0, 4)}]


In [151]:
def match_preunit_decimal_unit_pattern(input_str):
    """Detect quantities that follow the
    "pre-unit word + [decimal fraction] + unit" pattern.

    The pre-unit word is a word such as "چند"; the optional decimal fraction
    is a word such as ده, صد or هزار.

    Args:
        input_str: text to scan.

    Returns:
        A list with one dict per match, holding 'type' (the result of
        ``MeasurementExtractor.get_quantity_type`` for the unit), 'amount'
        (always an empty string, since this pattern has no exact amount),
        'unit', 'marker' (the matched text) and 'span' (its (start, end)
        offsets in ``input_str``).
    """
    # The length-sorted unit list is used so that a multi-word unit is tried
    # before a shorter unit it starts with ("فوت بر ثانیه" before "فوت"),
    # as match_amount_unit_pattern already does.
    pattern = re.compile(
        rf'({MeasurementExtractor.preunits_joined})+\s*'
        rf'({MeasurementExtractor.decimal_fractions_joined})?\s*'
        rf'({MeasurementExtractor.sorted_units_joined})+'
    )

    output = []
    for match in pattern.finditer(input_str):
        unit = match.group(3)
        output.append({
            'type': MeasurementExtractor.get_quantity_type(unit),
            'amount': '',
            'unit': unit,
            'marker': match.group(),
            'span': match.span(),
        })
    return output

In [152]:
# pre-unit word + [decimal fraction] + unit example
# (named so that it does not shadow the builtin input()):
preunit_sample = "چند صد هزار تن گوشت وارداتی"
print(match_preunit_decimal_unit_pattern(preunit_sample))

[{'type': 'وزن', 'amount': '', 'unit': 'تن', 'marker': 'چند صد هزار تن', 'span': (0, 14)}]


In [153]:
# Regression check: a multi-word unit is reported as a whole unit.
print(match_amount_unit_pattern(input_str="دو فوت بر ثانیه"))

[{'type': 'سرعت', 'amount': 2, 'unit': 'فوت بر ثانیه', 'marker': 'دو فوت بر ثانیه', 'span': (0, 15)}]


In [154]:
# End-to-end example that runs every matcher through MeasurementExtractor.run.
# Named so that it does not shadow the builtin input().
# NOTE(review): "کلیگرم" looks like a misspelling of "کیلوگرم", which would
# explain why "3 کلیگرم" is absent from the recorded output — confirm intent.
full_run_sample = "چند صد هزار تن گوشت وارداتی خرید و از رودخانه‌ای به عرض کم عبور داد تا 3 کلیگرم آرد بخرد"
print(MeasurementExtractor.run(full_run_sample))

[{'type': 'وزن', 'amount': 100000, 'unit': 'تن', 'marker': 'صد هزار تن', 'span': (4, 14)}, {'type': 'وزن', 'amount': '', 'unit': 'تن', 'marker': 'چند صد هزار تن', 'span': (0, 14)}]
