In [342]:
from __future__ import unicode_literals
import pandas as pd
import numpy as np
import re
from parsi_io.modules.number_extractor import NumberExtractor
import json
from hazm import *

In [361]:
class MeasurementExtractor:
    extractor = NumberExtractor()
    
    #loading translator data which maps quantity's name from english to persian
    translator_dataframe = pd.read_csv("QuantityNameTranslator.csv") 


    #loading units data and replacing "NaN" values with 0
    units_dataframe = pd.read_csv("Units.csv",header=None)
    units_dataframe = units_dataframe.replace(np.nan, 0)


    # loading pre-unit words which are used in "pre-unit word + [decimal frachtion] + unit" pattern.
    preunits_dataframe = pd.read_csv("PreUnitWords.csv",header=None)
    preunits_dataframe = preunits_dataframe.transpose()


    # loading pre-unit words which are used in "pre-unit word + [decimal frachtion] + unit" pattern.
    decimal_fractions_dataframe = pd.read_csv("DecimalFractions.csv",header=None)
    decimal_fractions_dataframe = decimal_fractions_dataframe.transpose()

    decimal_fractions_list = decimal_fractions_dataframe.values.tolist()
    decimal_fractions_joined = "|".join(decimal_fractions_list[0])    

    adjectives_dataframe = pd.read_csv("Adjectives.csv",header=0)
    pattern_3_adjectives = []
    pattern_4_adjectives = []
    pattern_4_adjective_to_type = {}
    adjectives_dataframe = adjectives_dataframe.reset_index()  # make sure indexes pair with number of rows
    for index, row in adjectives_dataframe.iterrows():
        if row['valid for pattern 3'] == 1:
            pattern_3_adjectives+= [row['adjective']]
        if row['valid for pattern 4'] == 1:
            pattern_4_adjectives+= [row['adjective']]
            pattern_4_adjective_to_type[row['adjective']]= row['pattern 4 type']        
    pattern_3_adjectives_joined = "|".join(pattern_3_adjectives)
    pattern_4_adjectives_joined = "|".join(pattern_4_adjectives)

    keywords_dataframe = pd.read_csv("Keywords.csv",header=0)
    
    all_keywords = []
    pattern_1_keywords = []
    pattern_1_keyword_to_type = {}
    keywords_dataframe = keywords_dataframe.reset_index()  # make sure indexes pair with number of rows
    for index, row in keywords_dataframe.iterrows():
        all_keywords+= [row['keyword']]
        pattern_1_keywords+= [row['keyword']]
        pattern_1_keyword_to_type[row['keyword']]= row['type']
    pattern_1_keywords_joined = "|".join(pattern_1_keywords)
    all_keywords_joined = "|".join(all_keywords)

    #creating a dictionary of units which key corresponds quantity name and value corresponds list of units related to that quantity
    units_dict = {}
    for index, row in units_dataframe.iterrows():
        qunantity_name = row[0]
        quantity_units = row[1:].tolist()
        #removing 0 values
        quantity_units = list(filter(lambda a: a != 0, quantity_units))
        units_dict [qunantity_name] = quantity_units


    #joining all units sorted by length in descending order
    all_units = []
    for key, value in units_dict.items():
        all_units += value
    sorted_units = sorted(all_units, key=len, reverse = True)
    sorted_units_joined = "|".join(sorted_units)
    
    preunits_list = preunits_dataframe.values.tolist()
    preunits_joined = "|".join(preunits_list[0])

    # We don't need this function anymore.
    # a function that joins all units with or ("|")
    units_joined = ""
    for key, value in units_dict.items():
        units_joined += "|".join(value)
        units_joined += "|"
    units_joined = units_joined[:-1]

    #a dictionary to map qunatity's name from english to farsi
    quantity_name_translator = {}
    for index, row in translator_dataframe.iterrows():
        quantity_name_translator[row["english"]] = row["persian"]
    
    def extract_noun_phrases(input_str):
        tagger = POSTagger(model='resources/postagger.model')
        chunker = Chunker(model='resources/chunker.model')
        tagged = tagger.tag(word_tokenize(input_str))
        phrases = tree2brackets(chunker.parse(tagged))
        all_matches = re.findall('\[.+?\]',phrases)
        noun_phrases = []
        pharases_span = []
        for m in all_matches:
            t = ()
            np = re.search('\[.*NP\]',m)
            if np != None:
                np_string = np.group()[1:-3]
                np_string = np_string.strip()
                noun_phrases.append(np_string)
                t += (input_str.find(np_string), input_str.find(np_string)+len(np_string))
                pharases_span.append(t)
        return noun_phrases, pharases_span


    # a function that gets the unit and returns corresponding quantity type in persian
    def get_quantity_type(unit):
        for key, value in MeasurementExtractor.units_dict.items():
            if unit in value:
                return MeasurementExtractor.quantity_name_translator[key]
        return 0
    
    def print_clean(output_list):
        for output in output_list:
            print(json.dumps(output, indent=4, ensure_ascii = False))
            print("#"*50)
    
    def add_point(input_str):
        return input_str+"."

    def run(input_str):
        output = match_keyword_adjective_pattern(input_str)
        output += match_amount_unit_pattern (input_str)
        output += match_keyword_amount_pattern (input_str)
        output += match_sole_adjecive_pattern (input_str)
        output += match_preunit_decimal_unit_pattern (input_str)
        
        return output

In [344]:
def match_keyword_adjective_pattern (input_str):
    all_matches = re.findall(f'({MeasurementExtractor.all_keywords_joined})+\s*({MeasurementExtractor.pattern_3_adjectives_joined})+',input_str)
    
    i = 0
    output = [{} for sub in range(len(all_matches))]
    for match in re.finditer(f'({MeasurementExtractor.all_keywords_joined})+\s*({MeasurementExtractor.pattern_3_adjectives_joined})+',input_str):
        #print(match)
        output[i]['unit'] = ""
        output[i]['amount'] = ""
        output[i]['marker'] = match.group()
        output[i]['span'] = match.span()
        i += 1
        
    return output

In [345]:
input = "رودخانه‌ای با عرض کم دیدم"
MeasurementExtractor.print_clean(match_keyword_adjective_pattern(input))

{
    "unit": "",
    "amount": "",
    "marker": "عرض کم",
    "span": [
        14,
        20
    ]
}
##################################################


In [346]:
def match_keyword_amount_pattern (input_str):
    values = MeasurementExtractor.extractor.run(input_str)
    phrase_amount_dict = {}
    for value in values:
        phrase_amount_dict[value['phrase']] = value['value'] 
    phrases_joined = "|".join(phrase_amount_dict.keys())
    
    all_matches = re.findall(f'({MeasurementExtractor.pattern_1_keywords_joined})+\s*({phrases_joined})+',input_str)
    
    i = 0
    output = [{} for sub in range(len(all_matches))]
    for match in re.finditer(f'({MeasurementExtractor.pattern_1_keywords_joined})+\s*({phrases_joined})+',input_str):
        output[i]['unit'] = ""
        output[i]['amount'] = phrase_amount_dict[all_matches[i][1]]
        output[i]['marker'] = match.group()
        output[i]['span'] = match.span()
        i += 1
        
    return output

In [347]:
input = "خانه اتاقی به ضلع 4 و حیاطی با طول 12 دارد"
MeasurementExtractor.print_clean(match_keyword_amount_pattern(input))

{
    "unit": "",
    "amount": 4.0,
    "marker": "ضلع 4",
    "span": [
        14,
        19
    ]
}
##################################################
{
    "unit": "",
    "amount": 12.0,
    "marker": "طول 12",
    "span": [
        31,
        37
    ]
}
##################################################


In [348]:
def match_sole_adjecive_pattern (input_str):
    all_matches = re.findall(f'({MeasurementExtractor.pattern_4_adjectives_joined})',input_str)
    
    i = 0
    output = [{} for sub in range(len(all_matches))]
    for match in re.finditer(f'({MeasurementExtractor.pattern_4_adjectives_joined})',input_str):
        output[i]['unit'] = ""
        output[i]['amount'] = ""
        output[i]['marker'] = match.group()
        output[i]['span'] = match.span()
        i += 1
        
    return output

In [349]:
input = "سریع حرکت کرد"
MeasurementExtractor.print_clean(match_sole_adjecive_pattern(input))

{
    "unit": "",
    "amount": "",
    "marker": "سریع",
    "span": [
        0,
        4
    ]
}
##################################################


In [350]:
"""
A function that detects quantities which follow "amount + unit" pattern.
"""

def match_amount_unit_pattern (input_str):
    nphrases,np_s = MeasurementExtractor.extract_noun_phrases(input_str)
    values = MeasurementExtractor.extractor.run(input_str)
    phrase_amount_dict = {}
    for value in values:
        phrase_amount_dict[value['phrase']] = value['value'] 
    phrases_joined = "|".join(phrase_amount_dict.keys())
        
    all_matches = re.findall(f'({phrases_joined})+\s*({MeasurementExtractor.sorted_units_joined})+',input_str)
    i = 0
    output = [{} for sub in range(len(all_matches))]
    for match in re.finditer(f'({phrases_joined})+\s*({MeasurementExtractor.sorted_units_joined})+',input_str):
        #comment to be added :)) 
        marker = match.group()
        j=0
        span = match.span()
        for s in np_s: 
            if s[0]<=match.span()[0] and s[1]>=match.span()[1]:
                start = match.span()[0]
                end = s[1]
                marker = input_str[start:end]
                span = (start, end)
            j+=1
        output[i]['type'] = MeasurementExtractor.get_quantity_type(all_matches[i][1])
        output[i]['item'] = marker.replace(match.group(),'')
        output[i]['amount'] = phrase_amount_dict[all_matches[i][0]]
        output[i]['unit'] = all_matches[i][1]
        output[i]['marker'] = marker
        output[i]['span'] = span
        i += 1
        
    return output

In [351]:
#amount + unit examples:
input = "دو کیلوگرم آرد خرید . یک ساعت صبر کرد."
MeasurementExtractor.print_clean(match_amount_unit_pattern(input))
input = "۲ گیگا بایت اینترنت رایگان گرفت"
MeasurementExtractor.print_clean(match_amount_unit_pattern(input))
input = " یک ماشین به وزن 3 تن محمد را زیر گرفت"
MeasurementExtractor.print_clean(match_amount_unit_pattern(input))
input = "سه متر پارچه و دو کیلوگرم آرد خرید ."
MeasurementExtractor.print_clean(match_amount_unit_pattern(input))


{
    "type": "وزن",
    "item": " آرد",
    "amount": 2,
    "unit": "کیلوگرم",
    "marker": "دو کیلوگرم آرد",
    "span": [
        0,
        14
    ]
}
##################################################
{
    "type": "زمان",
    "item": "",
    "amount": 1,
    "unit": "ساعت",
    "marker": "یک ساعت",
    "span": [
        22,
        29
    ]
}
##################################################
{
    "type": "ذخیره دیجیتال",
    "item": " اینترنت",
    "amount": 2.0,
    "unit": "گیگا بایت",
    "marker": "۲ گیگا بایت اینترنت",
    "span": [
        0,
        19
    ]
}
##################################################
{
    "type": "وزن",
    "item": " محمد",
    "amount": 3.0,
    "unit": "تن",
    "marker": "3 تن محمد",
    "span": [
        17,
        26
    ]
}
##################################################
{
    "type": "طول",
    "item": " پارچه",
    "amount": 3,
    "unit": "متر",
    "marker": "سه متر پارچه",
    "span": [
        0,
        12
    ]
}
##########

In [352]:
"""
A function that detects quantities which follow "pre-unit words such as "چند" + 
                                                [decimal fractions (ده-صد-هزار و ...)] + 
                                                unit" pattern.
"""
def match_preunit_decimal_unit_pattern(input_str):
    nphrases,np_s = MeasurementExtractor.extract_noun_phrases(input_str)

    all_matches = re.findall(f'({MeasurementExtractor.preunits_joined})+\s*({MeasurementExtractor.decimal_fractions_joined})?\s*({MeasurementExtractor.sorted_units_joined})+',input_str)
    i = 0
    output = [{} for sub in range(len(all_matches))]
    for match in re.finditer(f'({MeasurementExtractor.preunits_joined})+\s*({MeasurementExtractor.decimal_fractions_joined})?\s*({MeasurementExtractor.sorted_units_joined})+',input_str):
        marker = match.group()
        j=0
        span = match.span()
        for s in np_s: 
            if s[0]<=match.span()[0] and s[1]>=match.span()[1]:
                start = match.span()[0]
                end = s[1]
                marker = input_str[start:end]
                span = (start, end)
            j+=1
        
        output[i]['type'] = MeasurementExtractor.get_quantity_type(all_matches[i][2])
        output[i]['amount'] = ''
        output[i]['unit'] = all_matches[i][2]
        output[i]['item'] = marker.replace(match.group(),'')
        output[i]['marker'] = marker
        output[i]['span'] = span
        i += 1
        
    return output


In [353]:
input = "چند صد هزار تن گوشت وارداتی آورد و از رودخانه‌ای به عرض کم عبور داد تا 3 کیلوگرم گندم بخرد"

MeasurementExtractor.print_clean(match_preunit_decimal_unit_pattern(input))

{
    "type": "وزن",
    "amount": "",
    "unit": "تن",
    "item": " گوشت",
    "marker": "چند صد هزار تن گوشت",
    "span": [
        0,
        19
    ]
}
##################################################


In [313]:
# pre-unit word + [decimal fraction] + unit examples:
input = "چند صد هزار تن گوشت وارداتی"
MeasurementExtractor.print_clean(match_amount_unit_pattern(input))
input = "چند صد هزار تن گوشت وارداتی آورد و از رودخانه‌ای به عرض کم عبور داد تا 3 کیلوگرم گندم بخرد"
MeasurementExtractor.print_clean(match_amount_unit_pattern(input))

{
    "type": "وزن",
    "item": " گوشت وارداتی",
    "amount": 100000,
    "unit": "تن",
    "marker": "صد هزار تن گوشت وارداتی",
    "span": [
        4,
        27
    ]
}
##################################################
{
    "type": "وزن",
    "item": " گوشت",
    "amount": 100000,
    "unit": "تن",
    "marker": "صد هزار تن گوشت",
    "span": [
        4,
        19
    ]
}
##################################################
{
    "type": "وزن",
    "item": " گندم",
    "amount": 3.0,
    "unit": "کیلوگرم",
    "marker": "3 کیلوگرم گندم",
    "span": [
        71,
        85
    ]
}
##################################################


In [314]:
#bug solved:
MeasurementExtractor.print_clean(match_amount_unit_pattern("دو فوت بر ثانیه"))

{
    "type": "سرعت",
    "item": "",
    "amount": 2,
    "unit": "فوت بر ثانیه",
    "marker": "دو فوت بر ثانیه",
    "span": [
        0,
        15
    ]
}
##################################################


In [315]:
input = "چند صد هزار تن گوشت وارداتی خرید و از رودخانه‌ای به عرض کم عبور داد تا 3 کیلوگرم گندم بخرد"
#print(MeasurementExtractor.run(input))
MeasurementExtractor.print_clean(MeasurementExtractor.run(input))

{
    "unit": "",
    "amount": "",
    "marker": "عرض کم",
    "span": [
        52,
        58
    ]
}
##################################################
{
    "type": "وزن",
    "item": " گوشت وارداتی",
    "amount": 100000,
    "unit": "تن",
    "marker": "صد هزار تن گوشت وارداتی",
    "span": [
        4,
        27
    ]
}
##################################################
{
    "type": "وزن",
    "item": " گندم",
    "amount": 3.0,
    "unit": "کیلوگرم",
    "marker": "3 کیلوگرم گندم",
    "span": [
        71,
        85
    ]
}
##################################################
{
    "type": "وزن",
    "amount": "",
    "unit": "تن",
    "item": " گوشت وارداتی",
    "marker": "چند صد هزار تن گوشت وارداتی",
    "span": [
        0,
        27
    ]
}
##################################################


In [316]:
# amount + unit examples:
input1 = "دو کیلوگرم آرد خرید . یک ساعت صبر کرد."
MeasurementExtractor.print_clean(match_amount_unit_pattern(input1))

input2 = "۲ گیگا بایت اینترنت رایگان گرفت"
MeasurementExtractor.print_clean(match_amount_unit_pattern(input2))

input4 = "باتری خود را هشتاد و پنج صدم وات شارژ کرد."
MeasurementExtractor.print_clean(match_amount_unit_pattern(input4))

input5 = "علی سه کیلوگرم آرد را خرید"
MeasurementExtractor.print_clean(match_amount_unit_pattern(input5))

input6 = "سه مثقال طلا خرید و فروخت."
MeasurementExtractor.print_clean(match_amount_unit_pattern(input6))


# test cases:
t1 = "علی ۳.۵ کیلوگرم آرد خرید و باتری خود را هشتاد و پنج صدم وات شارژ کرد."
MeasurementExtractor.print_clean(match_amount_unit_pattern(t1))

t2 = "شهاب سنگی به تندی ۱۵ km/s وارد جو زمین شد."
MeasurementExtractor.print_clean(match_amount_unit_pattern(t2))

# t3 = "یک خودرو با سرعت زیاد از ما سبقت گرفت"


# buggy examples:
# in b1, it doesn't get نان as an item.
b1 = 'سه کیلوگرم نان را خورد و باتری خود را هشتاد و پنج صدم وات شارژ کرد.'
MeasurementExtractor.print_clean(match_amount_unit_pattern(b1))

b2 = "علی سه کیلوگرم آرد خرید."
MeasurementExtractor.print_clean(match_amount_unit_pattern(b2))


# pre-unit word + [decimal fraction] + unit examples:
input3 = "چند صد هزار تن گوشت وارداتی"
MeasurementExtractor.print_clean(match_preunit_decimal_unit_pattern(input3))

{
    "type": "وزن",
    "item": " آرد",
    "amount": 2,
    "unit": "کیلوگرم",
    "marker": "دو کیلوگرم آرد",
    "span": [
        0,
        14
    ]
}
##################################################
{
    "type": "زمان",
    "item": "",
    "amount": 1,
    "unit": "ساعت",
    "marker": "یک ساعت",
    "span": [
        22,
        29
    ]
}
##################################################
{
    "type": "ذخیره دیجیتال",
    "item": " اینترنت",
    "amount": 2.0,
    "unit": "گیگا بایت",
    "marker": "۲ گیگا بایت اینترنت",
    "span": [
        0,
        19
    ]
}
##################################################
{
    "type": "توان",
    "item": "",
    "amount": 80.05,
    "unit": "وات",
    "marker": "هشتاد و پنج صدم وات",
    "span": [
        13,
        32
    ]
}
##################################################
{
    "type": "وزن",
    "item": " آرد",
    "amount": 3,
    "unit": "کیلوگرم",
    "marker": "سه کیلوگرم آرد",
    "span": [
        4,
        18
    ]

In [326]:

#loading test cases
tests_dataframe = pd.read_csv("Tests.csv")
tests_dataframe = tests_dataframe.replace(np.nan, 0)


In [362]:
def test():
    index = 0
    for row in tests_dataframe["Texts"]:
        print("*"*50 + f' test {index} '+ "*"*50)
        output = MeasurementExtractor.run(MeasurementExtractor.add_point(row))
        MeasurementExtractor.print_clean(output)
        index +=1

In [363]:
test()

************************************************** test 0 **************************************************
{
    "type": "وزن",
    "item": "",
    "amount": 3.0,
    "unit": "کیلوگرم",
    "marker": "3 کیلوگرم",
    "span": [
        15,
        24
    ]
}
##################################################
{
    "unit": "",
    "amount": 3.0,
    "marker": "وزن 3",
    "span": [
        11,
        16
    ]
}
##################################################
************************************************** test 1 **************************************************


KeyError: ''