#### Author: Samia Zaman
#### Date: 25 Feb, 2021
### CS 334 Project 1: Building an interpreter for CATANDMOUSE programming language- Part 1

In [1]:
'''Return a boolean indicating whether given word was a valid token or not
    Arguments: 
    word - a string that is only a valid variable if it is comprised of numbers and digits
    digits - a list of strings in: 123456789
    letters - a list of strings in: abcdefg....xyz'''

def valid_variable(word, digits, letters):
    word = list(word)
    digits_in_word = set(word).intersection(set(digits))
    letters_in_word = set(word).intersection(set(letters))
    if (digits_in_word.union(letters_in_word) != set(word)):
        return False                  # foreign symbol outside letters and digits
    if (digits_in_word == set(word)):
        if len(word)<4:        # all digits but length less than 4
            return False
    return True               # otherwise, valid - mix of letters and digits

In [2]:
'''Return a boolean indicating whether given word was a valid token or not
    Arguments: 
    word - a string that is only a valid variable if it is comprised of numbers and digits
    digits - a list of strings in: 123456789'''

def valid_int(word, digits):
    try:
        int(word)
    except ValueError:
        return False
    else:
        if word.startswith("0"):
            if len(word) >1:
                return False
        if len(word)>3:
            return False
        return True

In [3]:
'''Return an integer index where '//' starts - indicating the start of a comment
Arguments:
    wordsList - a list of words (strings)'''
def comment_starts(wordsList):
    return wordsList.index('//')

In [4]:
'''Return a boolean indicating whether '//' present in line or not
Arguments: 
    wordsList - a list of words (strings)'''
def has_comment(wordsList):
    if '//' in wordsList:
        return True
    return False

In [5]:
'''Return a tuple of strings indicating the token type and the value of the word
Arguments: 
    word - a string which has a particular "token_type" 
    digits - a list of strings in: 123456789
    letters - a list of strings in: abcdefg....xyz
    keywords - a list of key words
    punctuations - just [;], for now '''

def get_token_type(word, digits, letters, keywords, 
              punctuations):
    if ((word in keywords) or (word in punctuations)):
        return (word, "NULL")
    if valid_int(word, digits):
        return ("integer", word)
    if valid_variable(word, digits, letters):
        return ("variable", '0')
    else:
        return ("error", '0')
    

In [6]:
'''Search for variable or integer already in table - otherwise insert integer/variable not seen before
Arguments: 
    word - a string with token_type = variable or integer
    value - a string with value = 0 if variable, and value of int if integer
    symTable - add to it 3-element lists of the form: [integer/variable, char value, int value]  '''

def search_and_insert(word, token_type, value, symTable):
    if (value == "NULL"):
        return []
    integers_found = [symTable[k][1] for k in range(len(symTable)) if symTable[k][0] == "integer"]  # value is in position 2
    variables_found = [symTable[k][1] for k in range(len(symTable)) if symTable[k][0] == "variable"] 

    if token_type == "integer":
        if (int(value) in integers_found):
            index = integers_found.index(int(value))
            return [index, 0]
        else:
            symTable.append(["integer", int(value), int(value)])
            return[len(symTable), 0]
        
    elif (token_type == "variable"):
        if (word in variables_found):
            index = variables_found.index(word)
            return [index, 0]
        else:
            symTable.append(["variable", word, int(value)])
            return[len(symTable), 0]
    
                

In [13]:
'''Main function for reading in the file specified, and printing whether each word in file is one of:
    1. A valid token - if so, print the token type and the value associated
    2. Invalid token - then print an error message specifying the line where the invalid token is
Additionally, 
    3. It stores valid, unique tokens into a data structure called symTable = 'symbol table'
    4. Returns the location ([row, column]) of the valid token in symbol table
Arguments:
    filename - file to be read
    symTable - initially an empty list;
    to be filled with (1x3) arrays of unique token_type followed by their int and char values'''

def scanner(filename, symTable):
    file = open(filename, "r")
    line_index = 0
    for line in file:
        words_in_line = line.strip().split(" ")
        if (words_in_line[0] == ''):
            continue
        if has_comment(words_in_line):
            comment_index = comment_starts(words_in_line)
            words_in_line = words_in_line[0:comment_index]
        line_index +=1
        print(words_in_line)
        for word in words_in_line:
            word = word.lower()
            (token_type, value) = get_token_type(word, digits, letters, keywords, punctuations)
            if token_type == "error":
                print("Error at line: " + str(line_index) + " on invalid token: '" + word +"'\n")
                pass
            else:
                print("Token is: " + word + ", with token type: " + token_type  + ", and value: " + value + "\n")
            #location = search_and_insert(word, token_type, value, symTable) # Throw this information away for now
            #print("Location for this token in symbol table is [row, column]: ", location, "\n") 
            
    return symTable 
            

In [14]:
# Set up the default inputs to driver function
# Defaults are specific to those specified in CATANDMOUSE programming language

filepath = ""
file_extension = ".txt"              # my windows computer only downloads .mc files as .mc.txt- e.g. p2e.mc.txt
symTable = []
keywords = ["begin", "halt","cat", "mouse", "clockwise", "move", "north", "south", "east", "west", "hole", 
          "repeat", "size", "end"]
punctuations = [";"]               # only ; for now
letters = list("abcdefghijklmnopqrstuvwxyz")
digits = list("0123456789")

### Run the driver function, 'scanner'

In [15]:
## call scanner to start processing the file
filename = input("Please enter a filename, (e.g. p1.mc/ p2e.mc): ")
filename = filename + file_extension
scanner(filename, symTable)

Please enter a filename, (e.g. p1.mc/ p2e.mc): p2e.mc
['cat', 'charlotte', '20', '21', 'east', ';']
Token is: cat, with token type: cat, and value: NULL

Token is: charlotte, with token type: variable, and value: 0

Token is: 20, with token type: integer, and value: 20

Token is: 21, with token type: integer, and value: 21

Token is: east, with token type: east, and value: NULL

Token is: ;, with token type: ;, and value: NULL

['mouse', '98', '5', '6', 'north', ';']
Token is: mouse, with token type: mouse, and value: NULL

Token is: 98, with token type: integer, and value: 98

Token is: 5, with token type: integer, and value: 5

Token is: 6, with token type: integer, and value: 6

Token is: north, with token type: north, and value: NULL

Token is: ;, with token type: ;, and value: NULL

['hole', '5874', '8;', '', '', '', '', '', '', '', '']
Token is: hole, with token type: hole, and value: NULL

Token is: 5874, with token type: variable, and value: 0

Error at line: 3 on invalid token

[]

In [19]:
from tabulate import tabulate
print(tabulate(symTable, headers=["TYPE","CH VALUE", "INT VALUE"]))

TYPE      CH VALUE                                            INT VALUE
--------  ------------------------------------------------  -----------
integer   1                                                           1
integer   2                                                           2
integer   3                                                           3
integer   12                                                         12
integer   23                                                         23
integer   123                                                       123
variable  1234                                                        0
variable  12345                                                       0
variable  1234878798127982739182791239128739                          0
variable  ajasjssjsdfsfkj35jk5k5jk5k5kj5lk2k234lj234k342lj            0
variable  58u                                                         0
variable  a4                                                    

#### THANK YOU !