In [15]:
import numpy as np
from collections import defaultdict
from collections import Counter

In [3]:
# Parse the data: ONLY NEED TO WORK ON THIS: REST IS DONE!!
# Per-sentence layout after grouping (see groupData): [[sent_id, [token indices], [tokens], [lemmatized tokens], [POS tags], [head indices], [dependency relations]]]
def parseData(filePath, encoding="utf-8"):
    """Read a whitespace-separated treebank file into flat token rows.

    A line starting with ``# sent_id`` sets the current sentence id (the text
    after the last ``=``, stripped). Blank lines and every other ``#`` line
    are skipped. Each remaining line is split on whitespace and, if it has
    at least six fields, contributes one row.

    Args:
        filePath: Path of the file to read.
        encoding: Text encoding used to open the file (UTF-8 by default so
            the result does not depend on the platform's locale).

    Returns:
        A list of 7-item lists ``[sentId, f0, f1, f2, f3, f4, f5]`` where
        ``f0``..``f5`` are the first six fields of the line, kept as strings.
    """
    data = []

    with open(filePath, 'r', encoding=encoding) as file:
        sentId = ""

        for line in file:
            line = line.strip()
            if line.startswith("# sent_id"):
                sentId = line.split("=")[-1].strip()

            elif line and not line.startswith("#"):
                divisions = line.split()
                # Six fields (indices 0..5) are consumed, so require six;
                # shorter lines are skipped instead of raising IndexError.
                if len(divisions) >= 6:
                    data.append([sentId] + divisions[:6])

    return data


# Load the raw rows, then keep only rows whose index field (position 1) is
# made up entirely of digits.
uncleanedTrainingData = parseData("./NLP2/train.txt")
trainingData = [sublist for sublist in uncleanedTrainingData if sublist[1].isdigit()]
# Preview only: printing every row floods the notebook output.
print(trainingData[:10])



In [4]:
def groupData(data):
    """Group flat token rows by sentence id into per-column lists.

    Args:
        data: Iterable of rows ``[sentId, field1, field2, ...]`` whose fields
            are strings. The number of columns for a sentence is fixed by the
            first row seen for that ``sentId``.

    Returns:
        A tuple ``(result, dicts)``:
        ``dicts`` maps each ``sentId`` to a list of column lists (one list per
        field position), and ``result`` is ``[[sentId, col1, col2, ...], ...]``
        in first-seen sentence order. Both share the same column list objects.

    Note:
        Every field made up only of decimal digits is stored as ``int``,
        whichever column it is in, so a digit-only token or lemma such as
        ``"2"`` ends up as the integer ``2``. All other fields stay strings.
    """
    dicts = {}

    for sublist in data:
        key = sublist[0]
        if key not in dicts:
            dicts[key] = [[] for _ in range(len(sublist) - 1)]
        columns = dicts[key]
        for i in range(1, len(sublist)):
            field = sublist[i]
            # isdecimal() matches exactly what int() can parse; isdigit()
            # also accepts characters such as superscript digits, for which
            # int() raises ValueError.
            if field.isdecimal():
                columns[i - 1].append(int(field))
            else:
                columns[i - 1].append(field)

    result = [[key] + values for key, values in dicts.items()]

    return result, dicts

# Group the flat rows per sentence (list view and dict view), then preview
# the first ten grouped sentences.
groupedTrainingData, dictionaryOfTrainingData = groupData(trainingData)
print(groupedTrainingData[:10])

[['GUM_academic_art-1', [1, 2, 3, 4, 5, 6], ['Aesthetic', 'Appreciation', 'and', 'Spanish', 'Art', ':'], ['aesthetic', 'appreciation', 'and', 'Spanish', 'art', ':'], ['JJ', 'NN', 'CC', 'JJ', 'NN', ':'], [2, 0, 5, 5, 2, 2], ['amod', 'root', 'cc', 'amod', 'conj', 'punct']], ['GUM_academic_art-2', [1, 2, 3, 4, 5], ['Insights', 'from', 'Eye', '-', 'Tracking'], ['insight', 'from', 'eye', '-', 'tracking'], ['NNS', 'IN', 'NN', 'HYPH', 'NN'], [0, 5, 5, 3, 1], ['root', 'case', 'compound', 'punct', 'nmod']], ['GUM_academic_art-3', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], ['Claire', 'Bailey', '-', 'Ross', 'claire.bailey-ross@port.ac.uk', 'University', 'of', 'Portsmouth', ',', 'United', 'Kingdom'], ['Claire', 'Bailey', '-', 'Ross', 'claire.bailey-ross@port.ac.uk', 'University', 'of', 'Portsmouth', ',', 'Unite', 'Kingdom'], ['NNP', 'NNP', 'HYPH', 'NNP', 'NNP', 'NNP', 'IN', 'NNP', ',', 'NNP', 'NNP'], [0, 1, 4, 2, 1, 1, 8, 6, 11, 11, 1], ['root', 'flat', 'punct', 'flat', 'list', 'list', 'case', 'nmod', '

In [10]:
def getLineNumberMappings(groupedData):
    """Build one lookup dict per sentence from group[1] entries to group[3] entries.

    In the grouped layout printed earlier in this notebook, position 1 holds
    the token indices and position 3 the lemmatized tokens, so each dict maps
    a token index to its word.

    Args:
        groupedData: Sequence of grouped sentences (indexable by position).

    Returns:
        A list with one dict per sentence, in input order.
    """
    mappings = []
    for sentence in groupedData:
        indices, wordsAtIndex = sentence[1], sentence[3]
        mappings.append({indices[k]: wordsAtIndex[k] for k in range(len(indices))})
    return mappings

def getInitialConfig(data):
    """Create the starting configuration tuple for every sentence.

    Each configuration is ``([], sentence[3], [])``: two fresh empty lists
    around the sentence's position-3 list (presumably stack / buffer / arcs
    for a transition-based parser -- confirm against the consumer).

    The middle element is the very same list object stored in ``data``; it is
    not copied.

    Args:
        data: Sequence of grouped sentences.

    Returns:
        A list of 3-tuples, one per sentence, in input order.
    """
    return [([], sentence[3], []) for sentence in data]

def getDependencyTree(groupedData, getWordsAtIndices):
    """Turn per-sentence index columns into word-level dependency pairs.

    For every sentence the entries of positions 1, 5 and 6 are walked in
    parallel (stopping at the shortest of the three). Each entry of position 1
    and position 5 is looked up in that sentence's index-to-word dict; a
    position-5 value of 0 is not looked up and yields ``"*"`` instead.

    Args:
        groupedData: Sequence of grouped sentences.
        getWordsAtIndices: Sequence of dicts, one per sentence, mapping an
            index to a word (as produced by getLineNumberMappings).

    Returns:
        A tuple ``(res, resE)``:
        ``res`` holds, per sentence, a list of ``(dependentWord, headWord)``
        tuples; ``resE`` holds, per sentence, a list of
        ``(dependentWord, headWord, label)`` tuples where ``label`` is the
        matching position-6 entry.
    """
    plainTrees = []
    labelledTrees = []

    for sentNo, sentence in enumerate(groupedData):
        depColumn = sentence[1]
        headColumn = sentence[5]
        labelColumn = sentence[6]
        indexToWord = getWordsAtIndices[sentNo]

        pairs = []
        triples = []
        for depIdx, headIdx, label in zip(depColumn, headColumn, labelColumn):
            depWord = indexToWord[depIdx]
            # Head index 0 has no word of its own; it is represented by "*"
            headWord = indexToWord[headIdx] if headIdx != 0 else "*"
            triples.append((depWord, headWord, label))
            pairs.append((depWord, headWord))

        plainTrees.append(pairs)
        labelledTrees.append(triples)

    return plainTrees, labelledTrees

# Derive the per-sentence starting configurations, the index-to-word lookups,
# and the word-level dependency trees (plain and labelled).
listOfConfigs = getInitialConfig(groupedTrainingData)
getWordsAtIndices = getLineNumberMappings(groupedTrainingData)
DT, DTE = getDependencyTree(groupedTrainingData, getWordsAtIndices)
# Preview only: printing every sentence's tree floods the notebook output.
print(DTE[:10])



In [6]:
def getPOS(groupedData):
    """Build one dict per sentence from group[3] entries to group[4] entries.

    In the grouped layout printed earlier in this notebook these are the
    lemmatized tokens and their POS tags. When a key occurs more than once in
    a sentence, the value from its last occurrence is kept.

    Args:
        groupedData: Sequence of grouped sentences.

    Returns:
        A list with one dict per sentence, in input order.
    """
    tagLookups = []
    for sentence in groupedData:
        lemmas, tags = sentence[3], sentence[4]
        tagLookups.append({lemmas[k]: tags[k] for k in range(len(lemmas))})
    return tagLookups

def getDepT(groupedData):
    """Build one dict per sentence from group[3] entries to group[6] entries.

    In the grouped layout printed earlier in this notebook these are the
    lemmatized tokens and their dependency labels. When a key occurs more than
    once in a sentence, the value from its last occurrence is kept.

    Args:
        groupedData: Sequence of grouped sentences.

    Returns:
        A list with one dict per sentence, in input order.
    """
    labelLookups = []
    for sentence in groupedData:
        lemmas, labels = sentence[3], sentence[6]
        labelLookups.append({lemmas[k]: labels[k] for k in range(len(lemmas))})
    return labelLookups

# Per-sentence lookup dicts keyed by the position-3 entries of each grouped
# sentence: one for POS tags, one for dependency labels.
pos = getPOS(groupedTrainingData)
depT = getDepT(groupedTrainingData)

In [26]:
def getFrequencyDictionaries(trainingData, topN=1005, maxFreq=1000):
    """Build index tables for frequent words, POS tags and dependency tags.

    Args:
        trainingData: Sequence of flat rows; position 3 is counted as the
            word, position 4 is used as the POS tag and position 6 as the
            dependency tag.
        topN: How many of the most frequent words to consider.
        maxFreq: Words occurring this many times or more are dropped from the
            considered set (so the result can hold fewer than ``topN`` words).

    Returns:
        A tuple ``(words, posTags, depTags)``:
        ``words`` is a ``defaultdict(int)`` mapping each kept word to its rank
        among the kept words (0 = most frequent kept word);
        ``posTags`` maps each unique POS tag to an index in first-seen order;
        ``depTags`` maps each unique dependency tag to an index in first-seen
        order.

    Note:
        Because ``words`` is a ``defaultdict(int)``, indexing it with an
        unknown word returns 0 (the same index as the first kept word) and
        inserts that word. Use ``in`` or ``.get`` to test membership.
    """
    wordFreqs = Counter(row[3] for row in trainingData)

    posTags = {}
    depTags = {}
    for row in trainingData:
        # setdefault assigns the next free index only on first sight, giving
        # first-seen ordering without a linear list scan per row.
        posTags.setdefault(row[4], len(posTags))
        depTags.setdefault(row[6], len(depTags))

    words = defaultdict(int)
    keptWords = (word for word, freq in wordFreqs.most_common(topN) if freq < maxFreq)
    for index, word in enumerate(keptWords):
        words[word] = index

    return words, posTags, depTags

# Build the word / POS-tag / dependency-tag index tables from the flat
# (ungrouped) rows, then show the vocabulary size and contents.
words, posTags, depTags = getFrequencyDictionaries(trainingData)
print(len(words))
print(words)

1000
defaultdict(<class 'int'>, {'and': 0, 'in': 1, 'a': 2, 'to': 3, 'that': 4, '-': 5, 'he': 6, 'this': 7, 'have': 8, '[': 9, ']': 10, '(': 11, ')': 12, 'for': 13, 'it': 14, "''": 15, 'I': 16, 'on': 17, 'his': 18, 'as': 19, 'you': 20, 'at': 21, 'by': 22, 'not': 23, 'with': 24, 'do': 25, "'s": 26, 'we': 27, '?': 28, 'from': 29, 'she': 30, '—': 31, 'they': 32, 'which': 33, 'study': 34, 'know': 35, 'or': 36, 'would': 37, 'but': 38, 'her': 39, 'can': 40, 'well': 41, 'one': 42, ':': 43, 'there': 44, 'will': 45, 'use': 46, 'what': 47, 'first': 48, 'go': 49, ';': 50, 'oh': 51, 'all': 52, 'so': 53, 'uh': 54, 'like': 55, 'their': 56, 'two': 57, 'into': 58, 'make': 59, 'other': 60, 'when': 61, 'more': 62, 'some': 63, 'then': 64, 'who': 65, 'after': 66, 'yeah': 67, 'time': 68, 'take': 69, 'year': 70, 'its': 71, 'also': 72, 'if': 73, 'work': 74, 'good': 75, 'system': 76, 'research': 77, 'such': 78, 'here': 79, 'only': 80, 'language': 81, 'about': 82, '2': 83, 'how': 84, 'no': 85, 'now': 86, 'our'

In [18]:
my_array = [1, 2, 3, "four", 5]

# Report whether the list has elements; a non-empty list is additionally
# rewritten with every element converted to its string form.
if my_array:
    print("Array is not empty")
    my_array = list(map(str, my_array))
else:
    print("Array is empty")

print("Array contents (all converted to strings):", my_array)

Array is not empty
Array contents (all converted to strings): ['1', '2', '3', 'four', '5']


In [12]:
import numpy as np

# Set the size of the list
v = 5  # Example value, replace it with your desired value

# Parameters of the normal distribution to sample from
mean = 0  # Mean of the distribution
std_dev = 1  # Standard deviation of the distribution
size = 4 * v  # Size of the list

# Draw the samples using the parameters declared above, so that editing
# `mean` or `std_dev` actually changes the distribution.
random_values = np.random.normal(mean, std_dev, size)

print("Random values:", random_values)


Random values: [ 0.44168781 -0.25328605  0.39067379 -0.25582069  0.88243781  0.40715537
 -1.45303591  0.94137556 -0.98574819  0.72988402  0.6572581  -0.13920064
  0.90908152 -0.05087172  1.41430215 -1.28300332 -0.14578565  0.9252518
 -1.53257808  1.04263041]
