# Requirements

In [None]:
# Root folder of the shared project on the mounted Google Drive.
# NOTE: the `!` shell cells of this notebook write "$HOME/..." and IPython
# interpolates that from THIS Python variable (the recorded unzip output shows
# the Drive path), so this cell must run before them.
HOME = '/content/drive/MyDrive/PAT_code_to_share'

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from google.colab import drive
drive.mount('/content/drive')

!pip install hazm
!pip install num2fawords

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hazm
  Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
[K     |████████████████████████████████| 316 kB 5.2 MB/s 
[?25hCollecting libwapiti>=0.2.1
  Downloading libwapiti-0.2.1.tar.gz (233 kB)
[K     |████████████████████████████████| 233 kB 61.3 MB/s 
[?25hCollecting nltk==3.3
  Downloading nltk-3.3.0.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 20.1 MB/s 
Building wheels for collected packages: nltk, libwapiti
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.3-py3-none-any.whl size=1394486 sha256=1351d8e1290f9de86ce676dc144ec8ed299181b7357de921e4f06a6eae066abc
  Stored in directory: /root/.cache/pip/wheels/19/1d/3a/0a8c14c30132b4f9ffd796efbb6746f15b3d6bcfc1055a9346
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
  Created wheel for libwapiti: filename=libwapiti-0.2.1-cp38-cp38-linux_x86_6

In [None]:
!cp -r "$HOME/Library/" .
!cp -r "$HOME/Data/urban_hierarchy.json" .
!unzip "$HOME/Data/RawDataZips/mrud.zip"
!unzip "$HOME/Data/RawDataZips/dehKhoda.zip"
!unzip "$HOME/Data/RawDataZips/sina_post_compact_addresses.zip"
!unzip "$HOME/Data/RawDataZips/sina_post_address_details.zip"

Archive:  /content/drive/MyDrive/PAT_code_to_share/Data/RawDataZips/mrud.zip
  inflating: mrud.csv                
Archive:  /content/drive/MyDrive/PAT_code_to_share/Data/RawDataZips/dehKhoda.zip
  inflating: dehKhoda.csv            
Archive:  /content/drive/MyDrive/PAT_code_to_share/Data/RawDataZips/sina_post_compact_addresses.zip
  inflating: sina_post_compact_addresses.csv  
Archive:  /content/drive/MyDrive/PAT_code_to_share/Data/RawDataZips/sina_post_address_details.zip
  inflating: sina_post_address_details.csv  


In [None]:
import re
import json
import numpy as np
import pandas as pd
from Library.Utils import *
from Library.PAT import PAT
from tqdm.notebook import tqdm
from Library.NBTree import NB3
from Library.NaiveBayes import NBLM,PNBLM
from Library.Preprocessor import Preprocessor
from Library.ApartementDetailExtractor import ADE
from Library.AccurateRestorer import AccurateRestorer

# Preprocessor

## Train

In [None]:
# --- Dictionary vocabulary ---------------------------------------------------
# Start from the unique entries of the first column of dehKhoda.csv.
dehKhoda = pd.read_csv('dehKhoda.csv', header=None)
dehKhoda = pd.DataFrame({'word': np.unique(dehKhoda[0])})
# Keep only entries that contain no '…', are longer than two characters,
# contain no space and contain no hyphen.
dehKhoda = dehKhoda[~dehKhoda['word'].apply(lambda x: '…' in x)]
dehKhoda = dehKhoda[dehKhoda['word'].apply(len) > 2]
dehKhoda = dehKhoda[dehKhoda['word'].apply(lambda x: x.count(' ')) == 0]
dehKhoda = dehKhoda[dehKhoda['word'].apply(lambda x: '-' not in x)]
# Short words are whitelisted separately; the boolean `suffix` / `prefix`
# columns of that file select the words handed to the Preprocessor as affixes.
dehKhoda_short_words = pd.read_csv('./Library/ValidShortWords.csv')
suffix = dehKhoda_short_words[dehKhoda_short_words.suffix].word.values.tolist()
prefix = dehKhoda_short_words[dehKhoda_short_words.prefix].word.values.tolist()
dehKhoda_short_words = dehKhoda_short_words[['word']]
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.x.
dehKhoda = pd.concat([dehKhoda, dehKhoda_short_words], ignore_index=True)
dehKhoda = dehKhoda.drop_duplicates(subset=['word'])

# --- Address corpora ---------------------------------------------------------
mrud = pd.read_csv('mrud.csv',usecols=['address','parcel'])
mrud = mrud[~mrud.parcel.isna()]
mrud = mrud[~mrud.address.isna()]
# Drop rows whose address contains either of these marker substrings
# (presumably artefacts of a broken encoding -- TODO confirm).
mrud = mrud[~mrud.address.apply(lambda x: 'کژی' in x or '«یٍ«' in x)]
sina_post = pd.read_csv('sina_post_compact_addresses.csv')
# Discard addresses that consist of digits only.
sina_post = sina_post[~sina_post.address.str.isdigit()]
# Strip a "/<digits>" fragment that directly precedes a Persian comma.
# .assign builds a new frame instead of writing into the filtered slice.
sina_post = sina_post.assign(
    address=sina_post.address.apply(lambda x: re.sub('/[0-9]+،', '،', x))
)
mrud_and_sina_post = pd.concat([mrud, sina_post])
# One training text: the unique addresses joined by the literal ' split '.
mrud_and_sina_post_text = ' split '.join(pd.unique(mrud_and_sina_post.address))

# --- Train the preprocessor and clean every address --------------------------
P = Preprocessor(dehKhoda.word, prefix, suffix)
P.train(mrud_and_sina_post_text, just_look_words=True)
# batch_run yields (address, cleaned) pairs via .items(); attach the cleaned
# text back onto every source row through a left merge on the raw address.
processed_addresses = P.batch_run(mrud_and_sina_post.address.values)
processed_addresses = pd.DataFrame(processed_addresses.items(), columns=['address', 'clean'])
processed_addresses = mrud_and_sina_post.merge(processed_addresses, on='address', how='left')

cleaning ...
10/10
analysing prefix suffix ...
10/10
extracting short names ...


100%|██████████| 40/40 [00:12<00:00,  3.11it/s]
100%|██████████| 74/74 [00:18<00:00,  3.98it/s]
100%|██████████| 40/40 [00:12<00:00,  3.19it/s]


updating vocabulary ...


100%|██████████| 16/16 [18:09<00:00, 68.12s/it]


### Save

In [None]:
# Persist the cleaned address table and the trained preprocessor to Drive so
# that the (slow) training cell does not have to be repeated.
processed_addresses.to_csv(
    f'{HOME}/Data/processed_addresses.csv',
    index=False,
)
P.save(f'{HOME}/Models/prep')

## Load

In [None]:
# Restore the artefacts written by the "Save" cell instead of retraining.
P = Preprocessor.load(
    f'{HOME}/Models/prep'
)
processed_addresses = pd.read_csv(
    f'{HOME}/Data/processed_addresses.csv'
)

# NB3

In [None]:
class ParishLayerLM(NBLM):
  """NBLM with a fixed smoothing factor of 1e-5, used for the parish layer."""

  def __init__(self):
    super().__init__(smooth_factor=1e-5)

class AvenueLayerLM(PNBLM):
  """PNBLM with smoothing factor 1e-5 and idf_power 1.5, used for the avenue layer."""

  def __init__(self):
    super().__init__(
        smooth_factor=1e-5,
        idf_power=1.5,
    )

## Train

In [None]:
layers = ['parish','preaven_type','preaven_name','avenue_type','avenue_name']
using_columns = layers+['parcel']
post_details = pd.read_csv('sina_post_address_details.csv',usecols=using_columns)
for col in layers :
  # Strings get '/' replaced by '_'; every other value (e.g. a missing NaN)
  # is converted with str().
  post_details[col] = post_details[col].apply(
      lambda x: x.replace('/','_') if isinstance(x, str) else str(x))
  # Blank out the 'nan' text produced above. A single .loc write is used
  # because chained indexing (frame[col][mask] = ...) may assign to a
  # temporary copy and silently leave the frame unchanged.
  post_details.loc[post_details[col] == 'nan', col] = ''
# Build one space-separated 'avenue' label from the four avenue-related
# columns, then drop those source columns (everything but 'parish').
post_details['avenue'] = post_details.preaven_type.str.\
                     cat(post_details.preaven_name, sep=' ').str.\
                     cat(post_details.avenue_type, sep=' ').str.\
                     cat(post_details.avenue_name, sep=' ')
post_details = post_details.drop(columns=layers[1:])

# Training frame: non-numeric addresses joined with their labels via 'parcel'.
df = processed_addresses[~processed_addresses.address.str.isdigit()]
df = df.merge(post_details, how='left', on='parcel')

# One language-model class per label layer: parish first, avenue second.
nb3 = NB3("تهران",
          layers             = ['parish','avenue','clean'],
          data               = df,
          actual_labels_freq = post_details,
          NB_class           = [ParishLayerLM, AvenueLayerLM])

100%|██████████| 337/337 [27:46<00:00,  4.95s/it]


### Save

In [None]:
# Persist the trained NB3 model under the project's Models folder on Drive.
nb3.save(f'{HOME}/Models/nb3/')

## Load

In [None]:
# Restore the NB3 model written by the "Save" cell. NB_class lists the same
# layer-model classes that were passed to NB3 at training time.
nb3 = NB3.load(
    f'{HOME}/Models/nb3/',
    parts_count=8,
    NB_class=[ParishLayerLM, AvenueLayerLM],
)

100%|██████████| 7/7 [04:31<00:00, 38.81s/it]


# ADE & AR

In [None]:
ade = ADE()
# Read the hierarchy inside a context manager so the file handle is closed
# deterministically; the encoding is explicit because the content must not
# depend on the platform's default locale.
with open('urban_hierarchy.json', encoding='utf-8') as hierarchy_file:
  urban_hierarchy = json.load(hierarchy_file)
# The two thresholds are passed as plain probabilities, written as
# exp(<negative exponent>).
ar = AccurateRestorer(layers=['parish','avenue'],
                      urban_hierarchy=urban_hierarchy,
                      preprocessor=P,
                      prob_keyword='probability',
                      label_cond_thresholds=np.exp(-150),
                      plateno_cond_thresholds=np.exp(-200)
                      )

# PAT

In [None]:
# Assemble the full pipeline from the preprocessor (P), the NB3 classifier,
# the apartment detail extractor (ade), the accurate restorer (ar) and the
# urban hierarchy loaded from JSON.
pat = PAT(P, nb3, ade, ar, urban_hierarchy)

In [None]:
# Example query: index the pipeline with a raw address string. The recorded
# output is a dict with the normalised address, apartment info, ranked
# parish/avenue suggestions and possible postcodes.
pat['شریعتی - ظفر - اطلسی - سیفیه شرقی - پ ۲۸']

{'PAT-understandable address': 'شریعت ظفر اطلسی سیف ی ه شرقی پلاک ۲۸',
 'appartement info': {'plateno': 28, 'floorno': None, 'unit': None},
 'most probable suggestion': {'probability': 0.999999277255864,
  'avenue': '  خيابان شهید مسعود سیفیه',
  'parish': 'داووديه'},
 'suggestions': [{'probability': 0.999999277255864,
   'avenue': '  خيابان شهید مسعود سیفیه',
   'parish': 'داووديه'},
  {'probability': 4.316337306333176e-161,
   'avenue': '  خيابان نساء',
   'parish': 'داووديه'},
  {'probability': 3.2187006313263777e-226,
   'avenue': '  خيابان شهید مسعودسیفیه غربی',
   'parish': 'داووديه'},
  {'probability': 1.0687756014031978e-302,
   'avenue': '  خيابان شهید حسن آقازاده فرد(اطلسی)',
   'parish': 'داووديه'},
  {'probability': 7.0266e-320,
   'avenue': '  خيابان شهید سرتیپ هوشنگ وحیددستگردی',
   'parish': 'داووديه'}],
 'possible postcodes': {'-1': ['1911713829'],
  '0': ['1911713830'],
  '1': ['1911713831', '1911713833'],
  '2': ['1911713834', '1911713835'],
  '3': ['1911713836', '191

In [None]:
# Second example query; this address also carries a unit number, which the
# recorded output reports under 'appartement info'.
pat['بزرگراه ستاری - بلوار فردوس شرقی - خیابان ابراهیمی جنوبی - نبش کوچه ۱۲ - پلاک ۱۴ - واحد ۲۴']

{'PAT-understandable address': 'بزرگ راه ستاری بلوار فردوس شرقی خیابان ابراهیمی جنوبی نبش کوچه ۱۲ پلاک ۱۴ واحد ۲۴',
 'appartement info': {'plateno': 14, 'floorno': None, 'unit': 24},
 'most probable suggestion': {'probability': 0.9831233767649038,
  'avenue': '  خيابان شهید ابراهیمی پوربسایی جنوبی',
  'parish': 'فردوس'},
 'suggestions': [{'probability': 0.9831233767649038,
   'avenue': '  خيابان شهید ابراهیمی پوربسایی جنوبی',
   'parish': 'فردوس'},
  {'probability': 0.01582352982876163,
   'avenue': '  خيابان ورزی شمالی',
   'parish': 'شهرک پرواز'},
  {'probability': 7.112743861817697e-129,
   'avenue': '  بزرگراه شهید سر لشکر ستاری',
   'parish': 'شهرک پرواز'},
  {'probability': 1.6418691105551065e-143,
   'avenue': '  خيابان ورزی جنوبی',
   'parish': 'شهرک پرواز'},
  {'probability': 2.3364873031175043e-156,
   'avenue': '  خيابان هجدهم شرقی',
   'parish': 'شهرک پرواز'},
  {'probability': 9.122241587004394e-178,
   'avenue': '  خيابان دوم غربی',
   'parish': 'شهرک پرواز'},
  {'probabi