In [6]:
'''
########################################################
## PROGRAM NAME: parse_zoning_txt.py                  ##
## PROJECT: NATIONAL ZONING AND LAND USE DATABASE     ##
## AUTHORS: MATT MLECZKO, SCOTT OVERBEY               ##
## DATE CREATED:                                      ##
## INPUTS:  User input .txt files                     ##
##          ZoningWeightedKeywords.csv                ##
##                                                    ##
## OUTPUTS:  .xls file with user-defined name         ##
##                                                    ##
## PURPOSE: Parse input zoning and land use text      ##
##          data and output database                  ##
########################################################

'''

'''
OVERALL PROCESS 

This program is a collection of functions. It is executed by running the finfun function, which triggers 
the rest of the nested functions. The one input argument for finfun is "filenames", which stores all the .txt files 
that are located in the input filepath that the user supplies. The finfun function then loops through these .txt 
files one at a time through "file", which is often the input into the nested functions of this program. The chronological 
sequence of functions once finfun is called is as follows

(1) finfun(file)
(2) getmatches
(3) get_keywords
(4) string_standardize
(5) fnote_fix
(6) threshold_mark
(7) matchvalue (nested in threshold_mark)
(8) densityinfo (also triggers fractonum and text2int)
(9) heightinfo (also triggers fractonum and text2int)
(10) parkinginfo (also triggers fractonum and text2int)
(11) resdis
(12) buildtablel1
(13) buildtablel2
(14) get_ts

TIPS FOR DEBUGGING 

This program and its constituent functions are written as a series of many loops. Oftentimes, the most straightforward
way of resolving an error in the code or determining how certain values are output is to print values along the sequence
of loops to determine how the code is handling a particular input text data. Some examples of this can be seen throughout 
the source code. Similarly, the code currently displays the filepath of each input file as it loops through all input files. 
The user is recommended to leave this as is since it helps determine which file is encountering an issue or error. The
same logic applies to printing particular values or signposts for particular functions or portions of functions. 

'''

###########################
## IMPORTANT USER INPUTS ##
###########################

# These four module-level settings are the only values a user is expected to
# edit before running the program. All of them are plain strings.

# OUTPUT FILE NAME #
## NOTE: this file will save to same directory as this program ##
# NOTE(review): the code that writes this file is not in this part of the
# source; confirm the save location against the export step.
outputfilename = "Code_CA_Irvine.xls"

# INPUT FOLDER FILE PATH TO INPUT .TXT FILES #
# Presumably the folder that is scanned for the .txt files handed to finfun
# (per the overview docstring above) -- verify against the driver code.
filedirect = "C:/Users/clint/Desktop/nzlud/municipal_codes_all/CA/"

# INPUT FOLDER FILE PATH TO KEYWORDS FILE #
# Joined with kwfile via os.path.join() in get_keywords() to locate the
# weighted-keyword CSV.
kwpath = "C:/Users/clint/Desktop/nzlud/"

# KEYWORD CSV FILE NAME (include extension) #
# Read by get_keywords(): the first row is skipped as a header and the first
# column of every later row is ignored; remaining cells are keyword*weight.
kwfile = "ZoningWeightedKeywords.csv"

###########################
###########################



In [7]:

## import necessary modules ##

import os
import csv
import pandas as pd
import regex as re
import xlwt
import glob
import statistics
from datetime import datetime
from statistics import mode
import collections
import itertools
from iteration_utilities import deepflatten


'''
The get_ts() function retrieves the timestamp associated with each input file from the input folder.
'''

def get_ts(file):
    """Return the last-modified time of an input file as a formatted string.

    Args:
        file: Path (or path-like value; it is passed through ``str``) of the
            input file.

    Returns:
        The file's modification time, converted with
        ``datetime.fromtimestamp`` and formatted as ``YYYY-MM-DD HH:MM:SS``.

    Side effects:
        Prints the normalized path, so the user can see which input file is
        currently being processed.
    """
    normalized = os.path.normpath(str(file))
    print(normalized)

    modified = datetime.fromtimestamp(os.path.getmtime(normalized))
    return modified.strftime('%Y-%m-%d %H:%M:%S')

'''
The get_keywords function retrieves keywords and their respective weights from the input .csv file. This csv file 
contains a row for each question, and keywords are listed as keyword*weight. From these, it creates one dictionary per 
question with the words as keys and the weights as values, and returns these dictionaries as a list in question order. 
It requires no arguments, so to retrieve the keywords, the user simply writes dicts = get_keywords().
'''

def get_keywords():
    """Load the weighted keywords for every question from the keyword CSV.

    The CSV is located by joining the module-level ``kwpath`` and ``kwfile``.
    Its first row is skipped as a header. In each following row the first
    column is ignored and every other non-empty cell of the form
    ``keyword*weight`` contributes one entry; cells without a ``*`` are
    skipped.

    Returns:
        A list with one dict per CSV row (in file order) mapping each keyword
        to its weight. Weights are kept as the strings read from the file.

    Side effects:
        Prints a warning when the file contains no rows after the header.
    """
    per_question = {}
    csv_location = os.path.join(kwpath, kwfile)
    with open(csv_location, "rt") as handle:
        rows = csv.reader(handle)
        next(rows)  # discard the header row
        for number, fields in enumerate(rows, start=1):
            weights = {}
            for cell in fields[1:]:
                pieces = cell.split("*")
                # An empty cell or one without '*' splits into a single piece.
                if len(pieces) > 1:
                    weights[pieces[0]] = pieces[1]
            per_question[number] = weights

    if not per_question:
        print("WARNING: No keywords found.")

    return list(per_question.values())


'''
The string_standardize function is used to standardize words in the input text. For example,
'singlefamily', 'single family', 'single-family', etc. will all be standardized to 'single family'. This makes 
the task of converting keywords to weights easier and faster.
'''

# Canonical form -> regex patterns that are rewritten to it. Dictionary order
# matters: entries are applied top to bottom, so multi-word forms such as
# 'building permit' are handled before the bare 'permit' rule.
# Patterns are regular expressions, so literal periods must be escaped.
# Update as needed.
_STANDARD_FORMS = {
    'amendment': [r'amendments'],
    'acre': [r'acres', r'ac\.'],
    ' percent': [r'%'],
    'single family': [r'single-family', r'singlefamily'],
    'multi family': [r'multi-family', r'multifamily', r'multiple-family', r'multiple family'],
    'building permit': [r'building permits'],
    'use permit': [r'use permits'],
    'zoning permit': [r'zoning permits'],
    'special permit': [r'special permits'],
    'improvement location permit': [r'improvement location permits'],
    'variation': [r'variations'],
    'limit': [r'limits', r'limited'],
    'application': [r'applications'],
    'annual': [r'annually'],
    'and': [r'&'],
    'front': [r'frt\.'],
    'year': [r'yearly'],
    'allocate': [r'allocation'],
    'cap': [r'capped', r'caps'],
    # 'construction': ['construct', 'constructed', 'construction'],  (disabled)
    'pay': [r'payment'],
    'fee': [r'fees'],
    'provide': [r'provided', r'provision'],
    'authorize': [r'authorized', r'authorizing'],
    'grant': [r'granted', r'granting'],
    'approve': [r'approved', r'approving', r'approval'],
    'require': [r'required', r'requires', r'requirement', r'requirements', r'requiring'],
    'issue': [r'issued', r'issuance'],
    'reserve': [r'reserved', r'reservation'],
    'dedicate': [r'dedicated'],
    'building height': [r'bldg\. hgt\.'],
    'designate': [r'designated'],
    'incorporate': [r'incorporated'],
    'moratorium': [r'moratoria'],
    'minimum': [r'min\.'],
    'maximum': [r'max\.'],
    'permit': [r'permits'],
    'plan': [r'plans'],
    'variance': [r'variances'],
    'lot': [r'lots'],
    'dwelling unit': [r'd\.u\.'],
    'dwelling unit per acre': [r'du\/acre', r'dus/acre'],
    'unit': [r'units'],
    'square feet': [r's\.f\.', r's\.\sf\.', r'sq\sft',
                    r's\.feet', r's\.\sfeet', r'sq\.feet',
                    r'sq\.\sfeet', r'sq\sfeet'],
    'mobile home park': [r'm\.h\. - park'],
    'mobile home subdivision': [r'm\.h\. - subdivision'],
    'acreage': [r'Acreage'],
    'in lieu': [r'in-lieu'],
    'set aside': [r'set-aside'],
}


def string_standardize(strng):
    """Rewrite spelling variants in the text to one canonical form.

    Every pattern in ``_STANDARD_FORMS`` is replaced by its canonical form
    with ``re.sub``. Canonical forms are processed in dictionary order and,
    within one form, patterns are applied in sorted order. Patterns are not
    anchored to word boundaries, so they also match inside longer words
    (e.g. 'lots' inside 'slots').

    Compared with the earlier version of this table, literal periods in
    'frt.', 'bldg. hgt.' and the two 'm.h. - ...' patterns are now escaped so
    they no longer match an arbitrary character, and all patterns are raw
    strings so the regex escapes are not invalid Python escape sequences.

    Args:
        strng: The text to standardize.

    Returns:
        The text with all variants replaced.
    """
    for canonical, patterns in _STANDARD_FORMS.items():
        for pattern in sorted(patterns):
            strng = re.sub(pattern, canonical, strng)
    return strng

'''
The fnote_fix function removes any footnote indicators from numbers in the thousands. This is necessary since many 
input dimensional tables will list numbers in the thousands and these numbers often have footnote indicators attached
to them. For instance, say the number 1,000 with a footnote indicator is listed in a dimensional table. Without this
function, the number will be processed as 10005 instead of 1000. This function removes the 5 (and any other footnote
indicators in the case of multiple indicators) by catching instances of more than 3 digits after a comma. 
'''

def fnote_fix(string):
    """Strip footnote digits that were fused onto comma-grouped numbers.

    Every match of the module-level ``numbers`` pattern that contains a comma
    is split on the comma. Any group longer than three characters is cut to
    its first three characters, and every occurrence of the original number
    in the text is replaced by the groups joined without commas
    (e.g. '1,0005' becomes '1000'). Numbers whose groups are all three
    characters or shorter are left untouched, commas included.

    The replacement is a literal ``str.replace``. The matched text is not
    re-used as a regular expression, so a '.' or backslash inside a match can
    neither match unrelated text nor raise a regex error.

    Args:
        string: The text to clean.

    Returns:
        The text with the affected numbers rewritten.
    """
    found = re.findall(numbers, string, flags=re.IGNORECASE)

    for num in found:
        if "," not in num:
            continue
        groups = num.split(",")
        for idx, group in enumerate(groups):
            if len(group) > 3:
                # Keep only the first three characters; the rest is treated
                # as a footnote marker.
                groups[idx] = group[:3]
                string = string.replace(num, "".join(groups))

    return string



## numbers is a regex meant to capture any instance of digit information ##
## code adapted from Wiktor Stribiżew from StackOverflow: https://stackoverflow.com/questions/39594066/using-regex-extract-all-digit-and-word-numbers ##

# Structure of `numbers`:
#   * (?x) turns on verbose mode, so whitespace in the pattern is ignored and
#     everything from '#' to the end of a line INSIDE the string is a regex
#     comment. The '#'-prefixed lines below are therefore disabled
#     alternatives that are still part of the string literal.
#   * One capturing group wraps two active alternatives, so re.findall()
#     returns plain strings (this is how fnote_fix() uses it).
#   * Both alternatives start with a negative lookbehind that rejects a match
#     preceded by '-' (plus optional digits/dots), by '.', or by 'table '
#     followed by dotted numbering.
#   * The lookbehind is variable-width. That is accepted by the third-party
#     `regex` module this file imports as `re`, but not by the standard
#     library `re` module.
numbers = r"""(?x)          # Turn on free spacing mode
            (
              #^a(?=\s)|     # Here we match a at the start of string before  whitespace
              #[-]?[0-9]+[,.]?[0-9]*[\/][0-9]+[,.]?[0-9]*|  # new numbers
              (?<!-\d*\.*|\.|table\s\d*\.*\d*\.*\d*\.*)\b[0-9]+[,.]?[0-9]*|  # new numbers
              (?<!-\d*\.*|\.|table\s\d*\.*\d*\.*\d*\.*)\b\d*\.?\,?\\?\d+ # HERE we match one or more digits
              #\b            # Initial word boundary 
              #(?:
              #    one|two|three|four|five|six|seven|eight|nine|ten| 
              #    eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen| 
              #    eighteen|nineteen|twenty|thirty|forty|fifty|sixty|seventy|eighty| 
              #    ninety|hundred|thousand|half
              #)             # A list of alternatives
              #\b            # Trailing word boundary
              )"""

## the following set of regex entries are meant to capture acre, square feet, dwelling unit, height, and parking info ##
# NOTE(review): the functions that consume these patterns are not in this part
# of the source; presumably densityinfo / heightinfo / parkinginfo -- confirm.

# Whole-word 'acre' (unless preceded by 'unit '), 'ac.' or 'ac'. The leading
# (?:) is an empty group and has no effect.
# NOTE(review): the trailing \b after 'ac\.' only holds when a word character
# directly follows the period.
acreinfo_s = r"(?:)\b((?<!unit\s)acre|ac\.|ac)\b"
# Square-footage phrases, including the bare word 'square' and per-dwelling-unit
# wording. 'sq\.\sfeet' is listed twice; the duplicate is harmless.
sqftinfo_s = r"\b(square\sfeet|sf|s\.f\.|sq\.\sfeet|sq\sft|square|for\seach\sdwelling\sunit|sq\.\sfeet|per\sdwelling\sunit)\b"
# Dwelling-unit density phrases. Verbose mode ((?x)) makes the line breaks and
# indentation inside the pattern insignificant.
unitinfo_s = r"""(?x)(?:\b(dwelling\sunit\sper\snet\sacre|unit\sper\snet\sacre|dwelling\sunit\sper\sacre|dwelling\sunit\sper\seach\s1\snet\sacre|
                    dwelling\sunit\sper\sacre|unit\sper\sacre|unit\/net\sacre|unit\sper\sgross\sacre|unit\sper\snet\splatted\sacre|
                    du\/gross\sacre|maximum\sdwelling\sunit\sper\sstructure|maximum\sdwelling\sunit\sper\sgross\sacre|
                    density\sper\sacre|
                    maximum\sdwelling\sunit\sper\sbuildable\sacre|up\sto\s\d+\s\d*\s*unit|up\sto\s\d+\sdwelling\sunit|square\sfeet\/du|
                    \d+\-\d+\sdwelling\sunit|dwelling\sper\sgross\sacre|minimum\snumber\sof\sunit|dwelling\sunit|dua)\b)"""
# 'feet' not preceded by 'square ', or 'ft' not preceded by 'sq ' -- i.e. a
# linear measure rather than an area.
height_ft = r"""(?x)(?:\b((?<!square\s)feet|(?<!sq\s)ft)\b)"""
# Height expressed in stories.
height_st = r"""(?x)\b(story|stories)\b"""
# Parking-requirement phrases (verbose mode; layout whitespace is ignored).
parkinfo = r"""(?x)\b(parking\sspace|parking\sspaces|parking\sspot|parking\sspots|parking|
                guest\sspace|per\sdu|per\sdwelling\sunit|per\sunit|minimum\sparking\srequire|
                for\seach\sdwelling\sunit|for\severy\sdwelling\sunit|for\seach\sapartment|
                minimum\sspaces\srequire|for\seach\sfamily|spaces)\b"""



In [8]:

'''
The getmatches function is the first step of the matching process. For each input file, it first completes a set of 
text pre-processing tasks. It then finds any general keywords in biglist, which has keywords separated by 
question (measure), and then retrieves the keyword and the surrounding x characters, depending on the measure. This 
process is meant to narrow searches to codes related to zoning and land use. The output from this step is a list of lists. 

Next, the getmatches function looks for more specific keywords, each of which has an associated weight that 
will be used to determine the value of a particular measure indicator. Consequently, this step produces a list of 
lists of lists. The largest list is every match (represented by 'matches'), which getmatches returns wrapped in a one-element list. 
Within matches, there are 27 elements, all of which are lists, each one corresponding to a question (measure). Each of these 
question lists is then a list of the actual matches. This method was used to organize our matches by question (measure).

For example:
matches[0] would return a list of matches for question (measure) 1.
matches[0][0] would return the first matches for question (measure) 1, which would look something like 
['building permit', 'single family']

Hence, matches has three layers: 
Layer 1 = a list of 27 lists representing the 27 question/measures
Layer 2 = a list of matching strings for a particular question/measure
Layer 3 = a list of matching keywords for a particular question/measure 
'''

# Section-heading keywords: a match on one of these gets the widest context
# window (300 characters before the match, 2000 after).
_WIDE_CONTEXT_KEYWORDS = frozenset([
    'zoning district', 'zoning districts', 'residence zone', 'residence zones',
    'zone district', 'zone districts', 'residential zones', 'residential zone',
    'residential district', 'residential districts', 'dwelling zone', 'multiuse zone',
    'classes of districts', 'district that is designed to', 'dwelling district:',
    'residence districts', 'residence district', 'multi family residential',
    'single family residential', 'single residential', 'multiple residential',
    'zone dwelling family size', 'housing (four stories or less) district',
    'residential single family district', 'residential multi family district',
    'mid-rise district', 'high-rise district', 'mixed use zone', 'overlay district',
    'three-family district', 'three family district', 'residential detached zones',
    'housing district', 'housing districts', 'residential overlay',
    'use regulation schedule', 'one-family zone', 'multi family zone',
    'residential classifications', 'district regulations', 'creation of districts',
    'r1 district', 'r2 district', 'r3 district', 'r4 district', 'r5 district',
    'r6 district', 'r7 district', 'land use districts', 'rm district',
    'r-1 district', 'r-2 district', 'r-3 district', 'r-4 district',
    're residential-existing district', 'conservation district', 'r-16 district',
    'low density residential', 'medium density residential', 'density residential',
    'residence a-1', 'residence a-2', 'low-rise', 'medium-rise', 'high-rise',
    'residence a district', 'residence aa district', 'residence b district',
    'residence bb district',
    'residence c-1 district', 'residence c-2 district', 'residence cc district',
    'residence d district', 'residence dd district', 'residence e district',
    'residence ee district',
    'residence f district', 'residence ff district', 'residence k district',
    'residential high density', 'residential medium density', 'rural residential',
    'residential one acre', 'residential two acre',
    'residential r-2 district', 'intensity regulations', 'residential-general district',
    'general residence district', 'residential use district',
    'residential r-3 district', 'residential r-4 district', 'dimension regulations',
    'low density-residential', 'medium-density residential',
    'medium-high-density residential',
    'residential urban zone', 'residential flexible zone',
    'urban residence', 'suburban residence',
    'residential urban district', 'residential suburban district',
    'residential limited business district',
    'dimensional require', 'dimensional and density regulations', 'dimension restrictions',
    'development standards', 'residential zones', 'schedule of', 'dimensional regulations',
    'dimensional standards', 'bulk and replacement', 'district design require',
    'height and area require', 'height and area regulations', 'height and lot require',
    'lot and bulk standards', 'lot standards by zone', 'development regulations',
    'lot dimension and intensity standards', 'density and bulk require', 'area and bulk standards',
    'bulk and placement regulations', 'district regulations', 'bulk require',
    'minimum lot size per dwelling unit', 'lot require', 'area and bulk schedule',
    'land space require', 'bulk regulations', 'lot area frontage and yard require',
    'yard and height require', 'lot standards matrix', 'other dimensions and space require',
    'area, yard and height regulations', 'bulk and area standards', 'density schedule',
    'development criteria district', 'zone standards', 'height limit lot sizes and coverage',
    'low density district', 'medium density district', 'high density district',
    'rural density district', r'site dimensions',
    'r-1 residential.', 'r-2 residential.', 'r-3 residential.',
    'bulk and area regulations', 'land use district and allowable uses',
    'bulk and setback regulations', 'intensity of use', 'dimensional controls',
    'residential bulk chart', 'bulk matrix', 'residential uses and require',
    'standards for principal buildings on individual lots', 'lot and yard require',
    'lot yard and density regulations', 'area setback and height require',
    'zoning district regulation chart', 'height area and yard require',
    'area yard and height standards', 'bulk and coverage controls', r'spatial require',
    'lot yard area and height require', 'area yard and height require',
    'height and yard require', 'bulk yard and space require', 'bulk and yard regulations',
    'table of allowed uses', 'table of permitted uses', 'use table',
    'density dimensions and other standards', 'area yard and height regulations',
    'districts:', 'density and intensity limit', 'bulk schedules',
])

# Parking keywords: matches get a symmetric 750-character window.
_PARKING_CONTEXT_KEYWORDS = frozenset([
    'parking spots', 'parking spaces', 'parking', 'off-street spaces require',
    'minimum parking require', 'minimum spaces require', 'vehicle', 'one space for',
])

# The bare keyword 'district' is skipped entirely when the document contains
# any of these more specific phrases.
_SPECIFIC_DISTRICT_PHRASES = (
    'zoning district', 'zoning districts', 'residence zone', 'residence zones',
    'zone district', 'zone districts', 'residential zones', 'residential zone',
    'residential district', 'residential districts', 'dwelling zone', 'multiuse zone',
    'classes of districts', 'district that is designed to',
    'residence districts', 'residence district', 'multi family residential',
    'single family residential', 'single residential', 'multiple residential',
    'zone dwelling family size', 'housing (four stories or less) district',
    'residential single family district', 'residential multi family district',
    'mid-rise district', 'high-rise district', 'mixed use zone', 'overlay district',
    'three-family district', 'three family district', 'residential detached zones',
    'housing district', 'housing districts', 'residential overlay',
    'use regulation schedule', 'one-family zone', 'multi family zone',
    'residential classifications', 'district regulations', 'creation of districts',
    'r1 district', 'r2 district', 'r3 district', 'r4 district', 'r5 district',
    'r6 district', 'r7 district', 'land use districts', 'rm district',
    'r-1 district', 'r-2 district', 'r-3 district', 'r-4 district',
    're residential-existing district', 'conservation district', 'r-16 district',
    'low density residential', 'medium density residential',
    r'residence a-1', r'residence a-2', r'low-rise', r'medium-rise', r'high-rise',
    'residence a district', 'residence aa district', 'residence b district',
    'residence bb district',
    'residence c-1 district', 'residence c-2 district', 'residence cc district',
    'residence d district', 'residence dd district', 'residence e district',
    'residence ee district',
    'residence f district', 'residence ff district', 'residence k district',
    "residential r-2 district", "intensity regulations", "residential-general district",
    "general residence district",
    "residential r-3 district", "residential r-4 district",
)


def _clean_zoning_text(raw):
    """Run the text pre-processing steps, in their original order, on one document."""
    text = re.sub(r'"\d+', '', raw)
    text = re.sub(r'[^\x00-\x7f]', r'-', text)  # non-ASCII characters become '-'
    text = ' '.join(text.split())  # collapse every whitespace run to one space
    text = re.sub(r'\n', ' ', text)  # takes away all new line indicators and replaces with a space
    text = re.sub(r'\t', ' ', text)  # removes \t characters
    text = re.sub(r'\\', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\d+\.\d{3,}\.*\d*\.*\d*|\d+\.\d+\.\d+|\+\-+|\-{3,}', '', text)
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'=', ' ', text)
    text = re.sub(r'\bsf\b', "square feet", text)
    text = re.sub(
        r'\b(Sterling\sCodifiers\,\sInc\.|Article|Chapter|Section|SECTION|Sections|Subsection|Sec|Prior\scode|Code|Ordinance|Ord|Lots|through|pg|pgs|Part)\b\.*\s*(No\.)*\s*\d*\w*\:*\.*\-*\d*\-*\d*\,*\.*\s*(and)*\s*\d*\-*\d*\-*\d*\,*\s*(and)*\s*\d*\-*\d*\-*\d*',
        '', text)
    # The first alternative previously read '\/d+' (a literal letter d), so
    # the day part of the date could never match; it is now '\/\d+'.
    text = re.sub(r'\bAmended\sby\sOrd\.\sNo\.\s\d+\,\s\d+\/\d+\/\d+|Amended\s\d{4}\b|amended\s\d{1,2}\/\d{1,2}\/\d{1,4}|\d+\/\d+\/\d+', '', text)
    text = re.sub(r'\d+\:\d+', '', text)
    text = text.lower()
    text = re.sub(r'\b(january|february|march|april|may|june|july|august|september|october|november|december)\b\s\d+\,*\s\d{4}', '', text)
    text = re.sub(r'pg\.|pgs\.\s\d+\-\d+|page\s\d+', '', text)
    text = re.sub(r'\bft\.|\bft\b', "feet ", text)
    text = string_standardize(str(text))
    text = fnote_fix(str(text))
    text = re.sub(r'[",!?*\[\]]', '', text)
    text = re.sub(r';', ' ', text)
    text = re.sub(r'\/acre|unit\/acre|unit\/net\sacre|unit\/gross\sacre|du\/ac\b', "unit per acre", text)
    text = re.sub(r'\d+\/\d+\/\d+\ssterling\scodifiers\sinc\.', '', text)
    return text


def _context_span(keyword, text):
    """Return (chars_before, chars_after) for a keyword, or None to skip it."""
    if keyword in _WIDE_CONTEXT_KEYWORDS:
        return 300, 2000
    if keyword in _PARKING_CONTEXT_KEYWORDS:
        return 750, 750
    if keyword == 'district' and any(phrase in text for phrase in _SPECIFIC_DISTRICT_PHRASES):
        return None
    return 250, 250


def _keyword_contexts(keyword, text):
    """Return the text window around every match of keyword, or None if there is nothing to record."""
    # The keyword is used as a regular expression, exactly as before.
    starts = [m.start(0) for m in re.finditer(keyword, text, flags=re.IGNORECASE)]
    if not starts:
        return None
    span = _context_span(keyword, text)
    if span is None:
        return None
    before, after = span
    size = len(text)
    # Clamp the window to the bounds of the document.
    return [text[max(p - before, 0):min(p + after, size)] for p in starts]


def getmatches(file1, sn):
    """Find keyword contexts in one input file and the weighted keywords inside them.

    Steps:
      1. Read ``file1`` and pre-process its text.
      2. For every question, search the text for that question's general
         keywords and keep a window of surrounding text for each hit.
      3. Write all collected windows to 'B_gen_matches.txt' (relative path,
         one keyword group per line, overwritten on every call).
      4. For every question, search each window for that question's weighted
         keywords from ``get_keywords()``.

    Earlier, the per-question bookkeeping ran only once after the question
    loop, so question 1 received the windows of all questions and the other
    questions were matched against the raw keyword strings. Each question now
    gets its own windows. The keyword lists also had missing commas that
    fused neighbouring entries; they are separate entries now.

    Args:
        file1: Path of the .txt file to parse.
        sn: Not used by this function; kept so existing callers keep working.

    Returns:
        A one-element list ``[matches]``. ``matches`` holds one entry per
        question (27). Each entry is a list with one item per general keyword
        that was found; each item is a list with one element per context
        window, and that element is the list of weighted keywords found in
        that window.

    Raises:
        IndexError: If ``get_keywords()`` returns fewer dictionaries than
            there are questions.
    """
    dicts = get_keywords()

    ## general keywords searched for each measure ##
    terms = [r'limit', r'maximum', r'growth control',
             r'dwellings', r'cap', r'growth',
             r'dwelling units', r'calculation', r'construction',
             r'annual', r'dwelling', r'year',
             r'allowable', r'limit', r'allocate',
             r'unit', r'moratorium', r'population',
             r'approved', r'quota', r'scheduled development',
             r'fixed', r'controlled', r'restricted',
             r'no more', r'growth management',]
    # One keyword list per question (measure). Every question currently uses
    # the same list; replace an entry to give a question its own keywords.
    biglist = [terms] * 27

    with open(file1, 'r', errors='replace') as handle:
        raw_text = handle.read()
    text = _clean_zoning_text(raw_text)

    # A keyword's windows depend only on the keyword and the text, so each
    # distinct keyword is searched once even when many questions share it.
    contexts_by_keyword = {}
    gen_matches_by_question = []
    for question in biglist:
        gen_matches = []
        for keyword in question:
            if keyword not in contexts_by_keyword:
                contexts_by_keyword[keyword] = _keyword_contexts(keyword, text)
            contexts = contexts_by_keyword[keyword]
            if contexts is not None:
                gen_matches.append(contexts)
        gen_matches_by_question.append(gen_matches)

    # Exporting the general matches as a txt file
    with open('B_gen_matches.txt', 'w') as export:
        for gen_matches in gen_matches_by_question:
            for match in gen_matches:
                export.write(str(match) + '\n')

    matches = []
    for i, gen_matches in enumerate(gen_matches_by_question):
        wordslist = list(dicts[i].keys())
        # Lookahead with a capture group so overlapping keywords are all reported.
        regex2 = re.compile("(?=(\\b" + "\\b|\\b".join(map(re.escape, wordslist)) + "\\b))", flags=re.IGNORECASE)
        matches.append([[regex2.findall(window) for window in windows]
                        for windows in gen_matches])

    return [matches]


In [9]:

'''
fractonum is a function to convert any numeric information stored as fractions into digits. The input is a 
captured string from get_matches. The output is the equivalent string if no fraction is found and the equivalent
string with the converted fraction if one is found. The code accounts for a number of different fraction formats, 
including mixed fractions and fractions expressed in words.
'''

def fractonum(string):
    """Rewrite fractions found in *string* as decimal digits.

    The string is split on whitespace and every token is inspected for
    slash fractions ("1/2"), mixed numbers ("2 1/2", "2-1/2"), per-acre
    forms ("1/2-acre"), hyphenated word fractions ("one-half",
    "two and one-half") and bare fraction words ("half", "quarter").
    Tokens that get absorbed into a neighbouring value are replaced by
    empty strings, so the result can contain runs of spaces.

    Args:
        string: text to scan.

    Returns:
        The tokens re-joined with single spaces, with every recognised
        fraction replaced by its numeric form.
    """
    # Word -> integer value, used for numerators, denominators and the
    # whole-number part of mixed fractions written in words.
    frac1 = {"zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
             "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
             "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
             "fourteen": 14, "fifteen": 15, "sixteen": 16,
             "seventeen": 17, "eighteen": 18, "nineteen": 19,
             "half": 2, "third": 3, "thirds": 3, "fourth": 4, "fourths": 4,
             "fifth": 5, "fifths": 5, "sixth": 6, "sixths": 6, "seventh": 7, "sevenths": 7,
             "eighth": 8, "eighths": 8, "ninth": 9, "ninths": 9, "tenth": 10, "tenths": 10}

    # Fraction word -> decimal value.
    # NOTE(review): "third" maps to 0.67 while one third is ~0.33 -- left
    # untouched here, confirm whether that value is intentional.
    frac2 = {"half": 0.5, "third": 0.67, "quarter": 0.25, "fourth": 0.25, "fifth": 0.2,
             "sixth": 0.16667}

    newstring = string.split()

    # skf is raised by a "feet/stories" or "stories/feet" token and cleared by
    # the first later token whose part before any "/" is not a digit string.
    # While it is raised, an "a/b" token with a multi-digit first part is
    # replaced by max(a, b) instead of a / b.
    skf = 0

    # Tokens such as "du/acre" or "x/unit" are rates, not fractions: swap the
    # slash for a space so the fraction logic below leaves them alone.
    for i, ele in enumerate(newstring):
        if ele.count("/") == 1 and re.search(r'du|\/unit', ele, flags=re.IGNORECASE):
            newstring[i] = " ".join(ele.split("/"))

    for i, word in enumerate(newstring):
        idx = i

        spword = word.split("/")

        if word in ["feet/stories", "stories/feet"]:
            skf = 1
        if skf == 1:

            if spword[0].isdigit() == False and word not in ["feet/stories", "stories/feet"]:
                skf = 0

        # --- common spelled-out halves / thirds / quarters -----------------
        if word == "one-and-a-half":
            newstring[i] = "1.5"
        # i >= 2: only look two tokens back when they exist; a negative index
        # would silently wrap around to the end of the list.
        elif i >= 2 and i != len(newstring) - 1 and newstring[i-2] == "one" and newstring[i-1] == "and" and newstring[i] == "one" and newstring[i + 1] == "half":
            newstring[i-2] = ""
            newstring[i-1] = ""
            newstring[i + 1] = ""
            newstring[i] = "1.5"
        elif i != len(newstring) - 1 and newstring[i] == "one" and newstring[i + 1] == "half":
            newstring[i] = "0.5"
            newstring[i + 1] = ""
        elif i != len(newstring) - 1 and newstring[i] == "a" and newstring[i + 1] == "half":
            newstring[i] = "0.5"
            newstring[i + 1] = ""
        elif i != len(newstring) - 1 and newstring[i] == "one" and newstring[i + 1] == "third":
            newstring[i] = "0.33"
            newstring[i + 1] = ""
        elif i != len(newstring) - 1 and newstring[i] == "one" and newstring[i + 1] == "quarter":
            newstring[i] = "0.25"
            newstring[i + 1] = ""

        # --- slash fractions -----------------------------------------------
        if word.count("/") == 1:
            new_word_in = word.split('/')
            new_word = [elem.replace(')', '') for elem in new_word_in]
            if sum(c.isdigit() for c in new_word[0]) > 1 and "." not in new_word[0]:
                # More than one digit before the slash: possibly a mixed
                # number glued together with a hyphen ("2-1/2").
                if "-" in new_word[0]:
                    new_new_word = new_word[0].split("-")
                else:
                    new_new_word = new_word[0].split()
                if len(new_new_word) > 2:
                    continue
                if len(new_new_word) > 1 and new_new_word[1].isdigit() and new_word[1].isdigit() and float(
                        new_word[1]) != 0:
                    num = float(new_new_word[1])
                    den = float(new_word[1])
                    rep = round(num / den, 2)
                    newstring[i] = ''
                    # Whole-number part with any non-digits (e.g. "(") removed.
                    in_rep = re.sub("[^0-9]", "", new_new_word[0])
                    if in_rep:
                        newstring[idx] = str(float(in_rep) + rep)
                elif "-" in new_word[1]:
                    rep_new_word = new_word[1].split("-")
                    if rep_new_word[0].isdigit() and rep_new_word[1] == "acre" and float(rep_new_word[0]) != 0:
                        if len(new_new_word) == 1:
                            num = float(new_new_word[0].replace('(',''))
                        elif len(new_new_word) > 1:
                            num = float(new_new_word[1].replace('(',''))
                        den = float(rep_new_word[0])
                        rep = round(num / den, 2)
                        in_rep = re.sub("[^0-9]", "", new_new_word[0])
                        newstring[idx] = str(float(in_rep) + rep) + " " + rep_new_word[1]
                elif len(new_new_word) == 1 and new_new_word[0].isdigit() and new_word[1].isdigit() and float(
                        new_word[1]) != 0 and skf == 0:
                    num = float(new_word[0])
                    den = float(new_word[1])
                    rep = round(num / den, 2)
                    newstring[idx] = str(float(rep))
                elif skf == 1:
                    num1 = float(new_word[0])
                    num2 = float(new_word[1])
                    rep = max(num1,num2)
                    newstring[idx] = str(float(rep))
            # idx >= 1: a fraction that opens the string has no preceding
            # whole number; without the guard newstring[-1] (the LAST token)
            # would be treated as its whole-number part.
            elif idx >= 1 and newstring[idx - 1].isdigit():
                if new_word[0].isdigit() and new_word[1].isdigit() and float(new_word[1]) != 0:
                    num = float(new_word[0])
                    den = float(new_word[1])
                    rep = round(num / den, 2)
                    newstring[i] = ''
                    newstring[idx - 1] = str(float(newstring[idx - 1]) + rep)
                elif "-" in new_word[1] and new_word[0].isdigit():
                    rep_new_word = new_word[1].split("-")
                    if rep_new_word[0].isdigit() and rep_new_word[1] == "acre" and float(rep_new_word[0]) != 0:
                        num = float(new_word[0])
                        den = float(rep_new_word[0])
                        rep = round(num / den, 2)
                        newstring[i] = ''
                        newstring[idx - 1] = str(float(newstring[idx - 1]) + rep) + " " + rep_new_word[1]
            else:
                new_word = word.split('/')
                if new_word[0].isdigit() and new_word[1].isdigit() and float(new_word[1]) != 0:
                    newstring[idx] = str(float(new_word[0]) / float(new_word[1]))
                elif new_word[0].isdigit() and "-" in new_word[1]:
                    rep_new_word = new_word[1].split("-")
                    if rep_new_word[0].isdigit() and rep_new_word[1] == "acre" and float(rep_new_word[0]) != 0:
                        newstring[idx] = str(float(new_word[0]) / float(rep_new_word[0])) + " " + rep_new_word[1]

        # --- hyphenated word fractions and bare fraction words -------------
        if '-' in word:
            new_word = word.split('-')
            # "<number> and <num>-<den>": needs two real preceding tokens.
            if idx >= 2 and newstring[idx - 1] == "and" and newstring[idx - 2] in frac1:
                if set([new_word[0], new_word[1]]).issubset(set(frac1)) and float(frac1[new_word[1]]) != 0:
                    num = frac1[new_word[0]]
                    den = frac1[new_word[1]]
                    rep = round(num / den, 2)
                    newstring[i] = ''
                    newstring[idx - 1] = ''
                    newstring[idx - 2] = str(frac1[newstring[idx - 2]] + rep)
                elif new_word[0] in frac1 and new_word[1] in frac2 and float(frac2[new_word[1]]) != 0:
                    num = frac1[new_word[0]]
                    den = frac2[new_word[1]]
                    rep = round(num * den, 2)
                    newstring[i] = ''
                    newstring[idx - 1] = ''
                    newstring[idx - 2] = str(frac1[newstring[idx - 2]] + rep)
                else:
                    continue
            elif set([new_word[0], new_word[1]]).issubset(set(frac1)) and float(frac1[new_word[1]]) != 0:
                num = frac1[new_word[0]]
                den = frac1[new_word[1]]
                rep = str(round(num / den, 2))
                newstring[i] = rep
            else:
                continue
        elif word in frac2:
            st = ["story","stories"]
            if idx + 3 <= len(newstring)-1:
                if newstring[idx + 1] in st or newstring[idx+3] in st:
                    rep = 1/frac2[newstring[idx]]
                    newstring[i] = str(round(rep))
            elif idx + 1 <= len(newstring)-1:
                if newstring[idx + 1] in st:
                    rep = 1/frac2[newstring[idx]]
                    newstring[i] = str(round(rep))
            # idx >= 3: the look-behind patterns need three real preceding
            # tokens; on shorter input the negative indexes used to wrap onto
            # the fraction word itself (KeyError) or run off the list
            # (IndexError).
            elif idx >= 3 and newstring[idx - 1] == "a" and newstring[idx - 2] == "and" and newstring[idx - 3] in frac1:
                rep = frac2[newstring[idx]]
                newstring[idx], newstring[idx - 1], newstring[idx - 2] = '', '', ''
                newstring[idx - 3] = str(frac1[newstring[idx - 3]] + rep)
            elif idx >= 3 and newstring[idx - 2] == "and" and newstring[idx - 3] in frac1 and newstring[idx - 1] in frac1:
                rep = frac1[newstring[idx - 1]] * frac2[newstring[idx]]
                newstring[idx], newstring[idx - 2], newstring[idx - 1] = '', '', ''
                newstring[idx - 3] = str(frac1[newstring[idx - 3]] + rep)
            else:
                newstring[i] = str(frac2[word])

    finstring = ' '.join(newstring)

    return finstring

'''
the text2int function is adapted from code written by someone with the username "recursive" from the StackOverflow
post here: https://stackoverflow.com/questions/493174/is-there-a-way-to-convert-number-words-to-integers

It essentially converts numeric info expressed in words into digits. The input is a string and the output is the same string
with any numeric information converted to digits.

'''

def text2int(textnum, numwords={}):
    """Replace numbers written in words inside *textnum* with digits.

    Adapted from the StackOverflow answer by user "recursive"
    (https://stackoverflow.com/questions/493174/). The text is split on
    whitespace; consecutive number words ("two hundred", "twenty-five",
    "fourth") are accumulated and emitted as a single numeric token, all
    other tokens are copied through.

    Args:
        textnum: text to convert.
        numwords: word -> (scale, increment) lookup table. It is filled in
            on first use; callers normally leave it at its default.

    Returns:
        The converted text. Tokens are separated by single spaces and a
        trailing space is present unless the text ends in a number.
    """
    # NOTE: the mutable default is deliberate -- the dict is populated once
    # and then only read, so it acts as a cache shared by all calls.
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
            "sixteen", "seventeen", "eighteen", "nineteen",
        ]

        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # "hundred" -> 10**2, every later scale -> 10**(3 * idx)
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    decs = {"tenths": 0.1, "hundredths":0.01, "thousandths":0.001}
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    # Ordinals that may have their ending stripped to obtain a cardinal.
    # (Words also present in ordinal_words are handled before this list is
    # consulted.)
    thwords = ['fourth', 'fifth', 'sixth', 'seventh', 'eighth', 'ninth', 'tenth', 'eleventh',
               'twelfth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'seventeenth',
               'eighteenth', 'nineteenth']

    # result  : completed groups already scaled by thousand / million / ...
    # current : the group still being accumulated
    # onnumber: True while a number is pending and not yet written out
    current = result = 0
    curstring = ""
    onnumber = False
    wstring = textnum.split()
    for w, word in enumerate(wstring):
        if word.count('-') == 1:
            newnum1, newnum2 = word.split("-")
            if newnum1 != "" and newnum1 in numwords and newnum2 != "" and newnum2 in numwords:
                # Hyphenated number such as "twenty-five".
                scale1, increment1 = numwords[newnum1]
                scale2, increment2 = numwords[newnum2]
                current = current * scale1 * scale2 + increment1 + increment2
                onnumber = True
            elif onnumber:
                # Ordinary hyphenated word after a number: write the pending
                # number first. Testing onnumber (rather than current != 0)
                # also flushes "zero" and values already moved into result
                # ("one thousand"), which would otherwise be emitted AFTER
                # this word.
                result += current
                curstring += repr(result) + " " + word + " "
                current = result = 0
                onnumber = False
            else:
                curstring += word + " "
        elif word in decs:
            # "five tenths" -> 0.5; the decimal word itself is not copied out.
            scale, increment = (decs[word], 0)
            current = round(current * scale + increment,2)
        elif word in ordinal_words:
            current = current + ordinal_words[word]
            onnumber = True
        else:
            for ending, replacement in ordinal_endings:
                if word.endswith(ending) and word in thwords:
                    word = "%s%s" % (word[:-len(ending)], replacement)

            if word not in numwords:
                if onnumber:
                    curstring += repr(result + current) + " "
                if word.isnumeric() and w!=len(wstring)-1 and wstring[w+1] == "hundredths":
                    # "<digits> hundredths" -> digits / 100
                    curstring += str(round(float(word) * 0.01, 2)) + " "
                else:
                    curstring += word + " "
                result = current = 0
                onnumber = False
            else:
                scale, increment = numwords[word]

                current = current * scale + increment
                if scale > 100:
                    # thousand and above close the current group
                    result += current
                    current = 0
                onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring

'''
the chunks() function is used to iterate through a list of rows in the buildtable functions
'''

def chunks(s, n):
    """Yield consecutive slices of *s*, each holding at most *n* items.

    The final slice is shorter when len(s) is not a multiple of n.
    """
    offsets = range(0, len(s), n)
    yield from (s[begin:begin + n] for begin in offsets)



In [10]:



'''
The finfun function runs the entire program and produces the output data. First, it writes the column names of the 
output spreadsheet manually. Next, it iterates through each file by calling all of the functions above. Each input file
takes on a new row of the output data and each column takes on a value according to the output of a particular nested
function above.

Some files may give a UnicodeDecodeError, meaning that they are encoded in something other than the standard format. These
files will be stored as blank rows in the main 'Municipalities' sheet and they are also added to a second sheet labeled
'Error Cities'.

Since the other functions described above are nested within the finfun function, the only function needed to call
is finfun(list of files).

'''

def finfun(filenames):
    """Process every input file and save the output workbook.

    A workbook with two sheets is created: 'Municipalities', whose first
    row receives the column headers listed below, and 'Error Cities'.
    Each file is assigned the next row of 'Municipalities': its name is
    written to column 0 and getmatches(file, row) is called (its return
    value is not used here). A file that raises UnicodeDecodeError keeps
    its row and is additionally listed on 'Error Cities'. The workbook is
    finally saved to the module-level ``outputfilename``.

    Args:
        filenames: iterable of input .txt file paths.
    """
    headers = (
        "muni",
        "restrict_sf_permit",
        "restrict_mf_permit",
        "limit_sf_units",
        "limit_mf_units",
        "limit_mf_dwellings",
        "limit_mf_dwelling_units",
        "min_lot_size",
        "max_density",
        "open_space",
        "inclusionary",
        "council_nz",
        "planning_nz",
        "countybrd_nz",
        "pubhlth_nz",
        "site_plan_nz",
        "env_rev_nz",
        "council_rz",
        "planning_rz",
        "zoning_rz",
        "countybrd_rz",
        "countyzone_rz",
        "townmeet_rz",
        "env_rev_rz",
        "adu",
        "half_acre_less",
        "half_acre_more",
        "one_acre_more",
        "two_acre_more",
        "max_den_cat1",
        "max_den_cat2",
        "max_den_cat3",
        "max_den_cat4",
        "max_den_cat5",
        "height_ft_median",
        "height_ft_mode",
        "height_st_median",
        "height_st_mode",
        "parking_median",
        "parking_mode",
        "mf per",
        "timestamp",
    )

    outxl = xlwt.Workbook()
    outsheet = outxl.add_sheet('Municipalities', cell_overwrite_ok=True)
    errorsheet = outxl.add_sheet('Error Cities')

    # Header row of the main sheet, one column per entry above.
    for column, title in enumerate(headers):
        outsheet.write(0, column, title)
    errorsheet.write(0, 0, 'muni')

    # Data rows start at 1; every file consumes one row whether or not it
    # could be decoded.
    errorrow = 1
    for row, path in enumerate(filenames, start=1):
        try:
            print(path)
            outsheet.write(row, 0, path)
            getmatches(path, row)

            print(path, "post sheet")
        except UnicodeDecodeError:
            errorsheet.write(errorrow, 0, path)
            errorrow += 1
            print(path, "hit decode error")

    outxl.save(outputfilename)



# Collect every path matching filedirect + "*.txt" and process the files in sorted order.
filenames = glob.glob(filedirect + "*.txt")
filenames.sort()
finfun(filenames)  # runs the whole program




C:/Users/clint/Desktop/nzlud/municipal_codes_all/CA\CA_Irvine.txt
C:/Users/clint/Desktop/nzlud/municipal_codes_all/CA\CA_Irvine.txt post sheet
