In [1]:
import numpy as np
from collections import defaultdict
import copy
from collections import Counter

In [2]:
# Parse the data: ONLY NEED TO WORK ON THIS: REST IS DONE!!
# what to keep inside: [[id, [token], [lemmatized_token], [indices of tokens/lemmatized tokens], [POS Tags], [head_indices], [modifier]]]
def parseData(filePath):
    data = []
    
    with open(filePath, 'r') as file:
        sentId = ""
        
        for line in file:
            line = line.strip()
            if line.startswith("# sent_id"):
                newLine = line.split("=")
                newSentence = newLine[-1].strip()
                sentId = newSentence
                
            elif line and not line.startswith("#"):
                divisions = line.split()
                if len(divisions) >= 5:
                    data.append([sentId, divisions[0], divisions[1], divisions[2], divisions[3], divisions[4], divisions[5]]) 
                    
    return data

In [3]:
def groupData(data):
    dicts = {}

    for sublist in data:
        key = sublist[0]
        if key not in dicts:
            dicts[key] = [[] for _ in range(len(sublist)-1)]
        for i in range(1, len(sublist)):
            if sublist[i].isdigit():
                dicts[key][i-1].append(int(sublist[i]))
            elif all(char.isdigit() for char in sublist[i]):
                dicts[key][i-1].append(float(sublist[i]))
            else:
                dicts[key][i-1].append(sublist[i])

    result = []
    for key, values in dicts.items():
        result.append([key] + values)

    return result, dicts

In [4]:
# For each sentence in the array it is a dictionary of mappings from numbers to words
def getLineNumberMappings(groupedData):
    result = []
    for group in groupedData:
        groupDict = {}
        keys = group[1]
        values = group[3]
        for i in range(len(keys)):
            groupDict[keys[i]] = str(values[i])
        result.append(groupDict)
    return result

# Lets make the initial configuration
def getInitialConfig(data):
    configLists = []
    for i in range(len(data)):
        newArr = data[i][3]
        updatedArr = []
        for element in newArr:
            updatedArr.append(str(element))
        configLists.append(([], updatedArr, []))
    return configLists

# Dependency tree extra is a 2D array of (head, dep, depTag)
# Dep Tree is a 2D array of (head, dep)
def getDependencyTree(groupedData, getWordsAtIndices):
    resE = []
    res = []
    
    for i in range(len(groupedData)):
        depIndices = groupedData[i][1]
        headIndices = groupedData[i][5]
        modIndices = groupedData[i][6]
        dicts = getWordsAtIndices[i]
        newArr = []
        newArrE = []
        
        for j in range(min(len(depIndices), len(headIndices), len(modIndices))):
            getDepWord = dicts[depIndices[j]]
            getHeadWord = "*"
            
            if headIndices[j] != 0:
                getHeadWord = dicts[headIndices[j]]
            
#             if getHeadWord != "*":
            newArrE.append((getHeadWord, getDepWord, modIndices[j]))
            newArr.append((getHeadWord,getDepWord))
        
        res.append(newArr)
        resE.append(newArrE)
        
    return res, resE

In [5]:
def getPOS(groupedData):
    result = []
    for group in groupedData:
        groupDict = {}
        keys = group[3]
        values = group[4]
        for i in range(len(keys)):
            val = str(keys[i])
            groupDict[val] = values[i]
        result.append(groupDict)
    return result

def getDepT(groupedData):
    result = []
    for group in groupedData:
        groupDict = {}
        keys = group[3]
        values = group[6]
        for i in range(len(keys)):
            val = str(keys[i])
            groupDict[val] = values[i]
        result.append(groupDict)
    return result

def getFrequencyDictionaries(trainingData):
    allWords = []
    allPos = []
    allDeps = []
    words = defaultdict(int)
    posTags = {}
    depTags = {}

    for i in range(len(trainingData)):
        value1 = str(trainingData[i][3])
        value2 = str(trainingData[i][4])
        value3 = str(trainingData[i][6])
        
        allWords.append(value1)

        if value2 not in allPos:
            allPos.append(value2)

        if value3 not in allDeps:
            allDeps.append(value3)

    wordFreqs = Counter(allWords)
    mostCommonWords = wordFreqs.most_common(1005)
    uniqueWords = [word for word, freq in mostCommonWords if freq < 1000]

    for i in range(len(uniqueWords)):
        words[uniqueWords[i]] = i
        
    for i in range(len(allPos)):
        posTags[allPos[i]] = i
        
    for i in range(len(allDeps)):
        depTags[allDeps[i]] = i

    return words, posTags, depTags

In [6]:
def performAction(configuration, action):
    stack, buffer, arcs = configuration   
    topS = stack[-1] if stack else None
    firstB = buffer[0] if buffer else None
    
    if action == "LEFT-ARC":
        arcs.append((firstB, topS))
        stack.pop()
    elif action == "RIGHT-ARC":
        arcs.append((topS, firstB))
        stack.append(firstB)
        buffer.pop(0)
    elif action == "REDUCE":
        stack.pop()
    elif action == "SHIFT":
        buffer.pop(0)
        stack.append(firstB)
        
    return (stack, buffer, arcs)

In [7]:
def oracle(configuration, DT):
    stack, buffer, arcs = configuration
    
    topS = stack[-1] if stack else None
    firstB = buffer[0] if buffer else None

    if firstB and topS and (firstB, topS) in DT and topS not in [D for H, D in arcs]:
        return "LEFT-ARC"
    elif topS and firstB and (topS, firstB) in DT:
        return "RIGHT-ARC"
    elif topS and firstB and topS in [D for H, D in arcs]:
        for word in reversed(stack[:-1]):
            if (word, firstB) in DT or (firstB, word) in DT:
                return "REDUCE"
    return "SHIFT"

In [8]:
def getBOW (word, getIndex):
    if word is None:
        return [0] * len(getIndex)
    elif word not in getIndex:
        return [0] * len(getIndex)
    else:
        index = getIndex[word]
        listOfStuff = [0]*len(getIndex)
        if index >=0 and index < len(getIndex):
            listOfStuff[index] = 1
        return listOfStuff

In [9]:
def getFeatureModel(configuration, index, pos, depT):
    stack, buffer, arcs = configuration
    
    topS = None
    firstB = None
    posTagTopS = None
    posTagFirstB = None
    posTagSecondB = None
    
    if stack:
        topS = stack[-1]
        posTagTopS = pos[index][topS]
    if buffer:
        firstB = buffer[0]
        posTagFirstB = pos[index][firstB]
        if len(buffer) >= 2:
            posTagSecondB = pos[index][buffer[1]]
    
    topDep = None
    topLDep = None
    topRDep = None
    firstLDep = None
    
    for head, depTag in arcs:
        if topS and head == topS:
            topDep = depTag
            if not topLDep:
                topLDep = depTag
            topRDep = depTag
        
        if firstB and head == firstB:
            if not firstLDep:
                firstLDep = depTag
                
    topDep = depT[index][topDep] if topDep and depT[index][topDep] else None
    topLDep = depT[index][topLDep] if topLDep and depT[index][topLDep] else None
    topRDep = depT[index][topRDep] if topRDep and depT[index][topRDep] else None
    firstLDep = depT[index][firstLDep] if firstLDep and depT[index][firstLDep] else None
    
    tops = getBOW(topS, words)
    first = getBOW(firstB, words)
    posTop = getBOW(posTagTopS, posTags)
    posFirst = getBOW(posTagFirstB, posTags)
    posSecond = getBOW(posTagSecondB, posTags)
    topDeps = getBOW(topDep, depTags)
    topLDeps = getBOW(topLDep, depTags)
    topRDeps = getBOW(topRDep, depTags)
    firstLDeps = getBOW(firstLDep, depTags)
    
    fM =  tops + first + posTop + posFirst + posSecond + topDeps + topLDeps + topRDeps + firstLDeps
    
    return fM

In [10]:
getAction = ["LEFT-ARC", "RIGHT-ARC", "REDUCE", "SHIFT"]
transitionAction = {
    "LEFT-ARC": 0, "RIGHT-ARC": 1, "REDUCE": 2, "SHIFT": 3
}

In [11]:
def performComputation(fM, action):
    v = len(fM)
    result = np.zeros(4 * v)
    result[transitionAction[action]*v : (transitionAction[action] + 1)*v] +=fM

    return result   

In [12]:
Results = []
uncleanedTrainingData = parseData("./NLP2/train.txt")
trainingData = [sublist for sublist in uncleanedTrainingData if sublist[1].isdigit()]

groupedTrainingData, dictionaryOfTrainingData = groupData(trainingData)

listOfConfigs = getInitialConfig(groupedTrainingData)
getWordsAtIndices = getLineNumberMappings(groupedTrainingData)
DT, DTE = getDependencyTree(groupedTrainingData, getWordsAtIndices)

pos = getPOS(groupedTrainingData)
depT = getDepT(groupedTrainingData)
words, posTags, depTags = getFrequencyDictionaries(trainingData)

def learnTheWeights(listOfConfigurations, maxIters, DT, pos, depT):
    v = (2*len(words) + 3*len(posTags) + 4*len(depTags))
    w = np.random.normal(0, 1, 4*v)
    for i in range(maxIters):
        for j in range(len(listOfConfigurations)):
#             print(j)
            c = [copy.deepcopy(tup) for tup in listOfConfigurations[j]]
            stack, buffer, arc = c
            maxValue = -9999
            trainingAction = "LEFT-ARC"
            bestFeature = np.array([])
            
            while buffer:
                fM = getFeatureModel((stack, buffer, arc), j, pos, depT)
                fM = np.array(fM)
                for action in getAction:
                    newFeatureVector = performComputation(fM, action)
                    value = np.dot(newFeatureVector, w)

                    if value > maxValue:
                        maxValue = value
                        trainingAction = action
                        bestFeature = newFeatureVector
                
                oracleAction = oracle(c, DT[j])
                
                if trainingAction != oracleAction:
                    w = w + performComputation(fM, oracleAction) - bestFeature
                stack, buffer, arc = performAction(c, oracleAction)
            Results.append(arc)
        
    return w     

In [13]:
weights = learnTheWeights(listOfConfigs, 10, DT, pos, depT)

In [14]:
print(weights)
print(Results[0])

[-3.15063549  2.00199092 -4.72718486 ...  8.06858242 13.29847703
  9.7357652 ]
[('appreciation', 'aesthetic'), ('art', 'Spanish'), ('art', 'and'), ('appreciation', 'art'), ('appreciation', ':')]


In [20]:
# This should return a valid action
def checkIfMoveIsLegal(c, testAction):
    stack, buffer, arcs = c
    if testAction == "SHIFT":
        return "SHIFT"
    elif testAction == "RIGHT-ARC":
        if stack:
            return "RIGHT-ARC"
        return "SHIFT"
    elif testAction == "LEFT-ARC":
        if len(stack) == 0:
            return "SHIFT"
        elif stack[-1] in [D for H, D in arcs]:
            return "REDUCE"
        else:
            return "LEFT-ARC"
    else:
        if len(stack) == 0:
            return "SHIFT"
        elif stack[-1] not in [D for H, D in arcs]:
            return "LEFT-ARC"
        else:
            return "REDUCE"

In [29]:
def testDataFunction(listOfConfigurations, DT, weights, posTest, depTest):
    testResults = []
    for j in range(len(listOfConfigurations)):
        c = [copy.deepcopy(tup) for tup in listOfConfigurations[j]]
        stack, buffer, arc = c
        testAction = "LEFT-ARC"
        bestFeature = np.array([])

        while buffer:
            fM = getFeatureModel((stack, buffer, arc), j, posTest, depTest)
            fM = np.array(fM)
            
            maxValue = -99999
            bestFeature = np.array([])
            
            for action in getAction:
                newFeatureVector = performComputation(fM, action)
                value = np.dot(newFeatureVector, weights)

                if value > maxValue:
                    maxValue = value
                    testAction = action
                    bestFeature = newFeatureVector
            
            # Illegal check
            updatedAction = checkIfMoveIsLegal(c, testAction)
            stack, buffer, arc = performAction(c, updatedAction)
            
        testResults.append(arc)
        
    return testResults

In [38]:
uncleanedTestData = parseData("./NLP2/test.txt")
testData = [sublist for sublist in uncleanedTestData if sublist[1].isdigit()]

groupedTestData, dictionaryOfTestData = groupData(testData)
listOfTestConfigs = getInitialConfig(groupedTestData)
getTestWordsAtIndices = getLineNumberMappings(groupedTestData)
DTTest, _ = getDependencyTree(groupedTestData, getTestWordsAtIndices)

posTest = getPOS(groupedTestData)
depTest = getDepT(groupedTestData)

# Call the test function and get the resultant array
testResults = testDataFunction(listOfTestConfigs, DTTest, weights, posTest, depTest)
print(testResults)

[[('prevalence', 'the'), ('discrimination', 'of'), ('discrimination', 'prevalence'), ('group', 'racial'), ('group', 'across'), ('America', 'contemporary'), ('America', 'in'), ('America', 'group'), (':', 'America'), (':', 'discrimination')], [('from', 'result'), ('sample', 'representative'), ('sample', 'nationally'), ('adult', 'of')], [('introduction', '.')], [('experience', 'personal'), ('discrimination', 'of'), ('discrimination', 'experience'), ('and', 'discrimination'), ('bias', 'and'), ('be', 'have'), ('be', 'bias'), ('the', 'be'), ('focus', 'the'), ('science', 'social'), ('research', 'science'), ('research', 'much'), ('focus', 'research'), ('focus', '.'), ('1', '['), ('3', '-'), ('3', '1'), ('3', ']')], [('have', 'sociologist'), ('the', 'explore'), ('consequence', 'adverse'), ('discrimination', 'of'), ('discrimination', 'consequence'), ('the', 'discrimination'), ('3', '['), ('5', '-'), ('5', '3'), ('5', ']'), (';', '5'), (';', 'the'), (';', 'have')], [('have', 'psychologist'), ('pr