# SAS File Description Parser

**Usage:**
- Place this notebook in the same folder as the SAS definition file
- Define the file name in the first cell
- Run all 3 cells
- The result is displayed at the end

In [1]:
# Modules and Filename config
import pandas as pd
import logging
import sys
import json

# SAS definition file to parse. It is a bare file name, so it is opened
# relative to the current working directory of the notebook.
file_name = 'I94_SAS_Labels_Descriptions.SAS'

In [2]:
# Small and unreliable SAS config parser
# Root logging setup: records go to stdout (so they show up in the cell
# output) and only WARNING and above are emitted, which keeps the parser's
# per-line INFO messages silent unless the level is lowered here.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.WARNING,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s \t %(message)s ',
)

# Named logger used by the parser below.
log = logging.getLogger('log')

def _split_sas_header(text):
    """Split ``'NAME & NAME - description'`` into (column names, description).

    The text is split at the first ``-``; when there is none, the first word
    is taken as the name and the rest as the description.  Names joined by
    ``&`` are returned separately, stripped and lower-cased.
    """
    head, dash, tail = text.partition('-')
    if not dash:
        head, _, tail = text.partition(' ')
    names = [name.strip().lower() for name in head.split('&')]
    return [name for name in names if name], tail.strip()


def get_sas_definitions(file):
    """Parse a SAS label/description file into a flat dictionary.

    The file is scanned line by line for two kinds of content:

    * Comment blocks (``/* NAME - description */``).  A block may span
      several lines and may name several columns joined by ``&``.  Every
      column name is lower-cased and mapped to the full description, with
      continuation lines joined by a single space,
      e.g. ``{'i94yr': '4 digit year'}``.
    * ``value <name>`` sections.  One trailing ``l`` is removed from the
      format name (constraints are declared as ``<variable label> + 'l'``);
      any other character, such as a leading ``$``, is kept.  Each following
      ``key = 'label'`` line is stored under a tuple key,
      e.g. ``{('i94cnty', '582'): 'MEXICO ...'}``.  A section ends at an
      empty line or at a terminating ``;``.

    Args:
        file: Path of the SAS file to read.

    Returns:
        dict mapping column names (str) to descriptions and
        ``(variable, code)`` tuples to constraint labels.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Same logger object as the notebook-level ``log``; fetched here so the
    # function does not depend on a global being defined first.
    logger = logging.getLogger('log')

    data_dict = {}
    # True while inside a "value ..." constraint section
    in_values = False
    # True while inside a comment that has not been closed by "*/" yet
    in_comment = False
    # Columns described by the comment currently being read
    comment_cols = []
    # Variable name of the current constraint section
    varname = ''

    with open(file, mode='r') as f:
        for raw_line in f:
            line = raw_line.strip()

            # A line opening with "/*" always starts a new description.
            if line.startswith('/*'):
                # len check: "/*/" is an opener only, not "/*" + "*/"
                closed = len(line) >= 4 and line.endswith('*/')
                text = (line[2:-2] if closed else line[2:]).strip()
                comment_cols, desc = _split_sas_header(text)
                in_comment = not closed
                for cn in comment_cols:
                    data_dict[cn] = desc
                    logger.info(
                        'Found Field Description for column %s: %s', cn, desc)
                continue

            # Continuation (or closing line) of a multi-line description.
            if in_comment:
                closed = line.endswith('*/')
                text = (line[:-2] if closed else line).strip()
                in_comment = not closed
                if text:
                    for cn in comment_cols:
                        data_dict[cn] = (data_dict[cn] + ' ' + text).strip()
                        logger.info(
                            'Appending Field Description for column %s', cn)
                continue

            # An empty line marks the end of a constraint section.
            if not line:
                in_values = False
                continue

            # "value <name>" marks the beginning of a constraint section.
            tokens = line.split()
            if tokens[0].lower() == 'value':
                if len(tokens) < 2:
                    logger.warning(
                        'Ignoring "value" statement without a format name')
                    in_values = False
                    continue
                in_values = True
                name = tokens[1]
                # Drop only the single 'l' suffix, not every trailing 'l'.
                varname = name[:-1] if name.endswith('l') else name
                continue

            # Inside a section each constraint is assigned with "=".
            if in_values:
                # A trailing ";" closes the SAS value statement.
                terminated = line.endswith(';')
                body = line[:-1] if terminated else line
                if '=' in body:
                    # Split on the first "=" only so labels may contain "=".
                    key, _, value = body.partition('=')
                    key = key.strip().strip('\'')
                    value = value.strip().strip('\'')
                    data_dict[(varname, key)] = value
                    logger.info(
                        'Getting parameter value for variable %s: %s %s',
                        varname, key, value)
                if terminated:
                    in_values = False

    return data_dict

# Parse the configured SAS file once; the cells below read result_dict.
result_dict = get_sas_definitions(file_name)

In [29]:
# Print every value-constraint entry as "variable ; code ; label".
# Constraint entries are the ones stored under a (variable, code) tuple key;
# plain string keys hold column descriptions and are skipped here.
for key, value in result_dict.items():
    if isinstance(key, tuple):
        print(key[0], ';', key[1], ';', value)

i94cnty ; 582 ; MEXICO Air Sea, and Not Reported (I-94, no land arrivals)
i94cnty ; 236 ; AFGHANISTAN
i94cnty ; 101 ; ALBANIA
i94cnty ; 316 ; ALGERIA
i94cnty ; 102 ; ANDORRA
i94cnty ; 324 ; ANGOLA
i94cnty ; 529 ; ANGUILLA
i94cnty ; 518 ; ANTIGUA-BARBUDA
i94cnty ; 687 ; ARGENTINA 
i94cnty ; 151 ; ARMENIA
i94cnty ; 532 ; ARUBA
i94cnty ; 438 ; AUSTRALIA
i94cnty ; 103 ; AUSTRIA
i94cnty ; 152 ; AZERBAIJAN
i94cnty ; 512 ; BAHAMAS
i94cnty ; 298 ; BAHRAIN
i94cnty ; 274 ; BANGLADESH
i94cnty ; 513 ; BARBADOS
i94cnty ; 104 ; BELGIUM
i94cnty ; 581 ; BELIZE
i94cnty ; 386 ; BENIN
i94cnty ; 509 ; BERMUDA
i94cnty ; 153 ; BELARUS
i94cnty ; 242 ; BHUTAN
i94cnty ; 688 ; BOLIVIA
i94cnty ; 717 ; BONAIRE, ST EUSTATIUS, SABA
i94cnty ; 164 ; BOSNIA-HERZEGOVINA
i94cnty ; 336 ; BOTSWANA
i94cnty ; 689 ; BRAZIL
i94cnty ; 525 ; BRITISH VIRGIN ISLANDS
i94cnty ; 217 ; BRUNEI
i94cnty ; 105 ; BULGARIA
i94cnty ; 393 ; BURKINA FASO
i94cnty ; 243 ; BURMA
i94cnty ; 375 ; BURUNDI
i94cnty ; 310 ; CAMEROON
i94cnty ; 326 ; CA

In [28]:
# Bare expression: as the last statement of a notebook cell it displays the
# whole parsed dictionary.
result_dict

{'i94yr': '4 digit year',
 'i94mon': 'Numeric month',
 'i94cit': 'This format shows all the valid and invalid codes for processing',
 'i94res': 'This format shows all the valid and invalid codes for processing',
 ('i94cnty',
  '582'): 'MEXICO Air Sea, and Not Reported (I-94, no land arrivals)',
 ('i94cnty', '236'): 'AFGHANISTAN',
 ('i94cnty', '101'): 'ALBANIA',
 ('i94cnty', '316'): 'ALGERIA',
 ('i94cnty', '102'): 'ANDORRA',
 ('i94cnty', '324'): 'ANGOLA',
 ('i94cnty', '529'): 'ANGUILLA',
 ('i94cnty', '518'): 'ANTIGUA-BARBUDA',
 ('i94cnty', '687'): 'ARGENTINA ',
 ('i94cnty', '151'): 'ARMENIA',
 ('i94cnty', '532'): 'ARUBA',
 ('i94cnty', '438'): 'AUSTRALIA',
 ('i94cnty', '103'): 'AUSTRIA',
 ('i94cnty', '152'): 'AZERBAIJAN',
 ('i94cnty', '512'): 'BAHAMAS',
 ('i94cnty', '298'): 'BAHRAIN',
 ('i94cnty', '274'): 'BANGLADESH',
 ('i94cnty', '513'): 'BARBADOS',
 ('i94cnty', '104'): 'BELGIUM',
 ('i94cnty', '581'): 'BELIZE',
 ('i94cnty', '386'): 'BENIN',
 ('i94cnty', '509'): 'BERMUDA',
 ('i94cnty', 