# Classification

In [55]:
import keras
import tensorflow as tf
import numpy as np
from keras import layers

import json
import pandas as pd

## Pre-Processing

In [None]:
class Datasets:
    def __init__(self, trainingData, testingData, validationData):
        self.sqlTemplateToIds = self.GetUniqueSqlTemplatesFromTrainingSet(trainingData)
        self.AddSqlIdColumn(trainingData)
        
        self.trainingData = pd.DataFrame(trainingData)
        self.testingData = pd.DataFrame(testingData)
        self.validationData = pd.DataFrame(validationData)
        
    
    def GetUniqueSqlTemplatesFromTrainingSet(self, trainingData):
        uniqueTemplates = {}
        i = 0
        for datapoint in trainingData:
            template = datapoint['sql-template']
            if template not in uniqueTemplates:
                uniqueTemplates[template] = i
                i+=1
                
        return uniqueTemplates
            
    def AddSqlIdColumn(self, dataset):
        for datapoint in dataset:
            sqlTemplate = datapoint["sql-template"]
            datapoint['sql-id'] = self.sqlTemplateToIds[sqlTemplate]

In [57]:
def GetVariableIdDictionary(fullDatasetFileLocation):
    with open(fullDatasetFileLocation, 'r', encoding='utf-8') as f:
        fullDataset = json.load(f)
    
    uniqueVariables = {'O': 0}
    
    i = 1
    for datapoint in fullDataset:
        for sentence in datapoint['sentences']:
            for variableName in sentence['variables'].keys():
                if variableName not in uniqueVariables:
                    uniqueVariables[variableName] = i
                    i+=1

    return uniqueVariables

def GetDatasets(fullDatasetFileLocation: str):
    with open(fullDatasetFileLocation, 'r', encoding='utf-8') as f:
        fullDataset = json.load(f)
    
    formattedDataset = StripAndFormat(fullDataset)
    
    QueryDatasets = GetSplit(formattedDataset, 'query-split')
    QuestionDatasets = GetSplit(formattedDataset, 'question-split')
    
    return QueryDatasets, QuestionDatasets

def StripAndFormat(dataset):
    formattedDataset = []
    ParseSentences(dataset)
    
    for datapoint in dataset:
        sqlTemplate = min(datapoint["sql"])
        for sentence in datapoint['sentences']:
            formattedDataset.append( {
                'text': sentence['text'],
                'full-text': sentence['full-text'],
                'sql-template': sqlTemplate,
                'tags': sentence['tags'],
                'query-split': datapoint['query-split'],
                'question-split': sentence['question-split']
                })
    
    return formattedDataset
    
def ParseSentences(dataset):
    for datapoint in dataset:
        for sentence in datapoint['sentences']:
            AddFullTextAndTags(sentence)
            
            
def AddFullTextAndTags(sentence):
    tags = []
    fullText = []
    words = sentence['text'].split(" ")
    for word in words:
        if word in sentence['variables']:
            replacementWords = sentence['variables'][word]
            fullText.append(replacementWords)
            wordList = replacementWords.split(" ")
            for replacementWord in wordList:
                tags.append(word)
        else:
            tags.append('O')
            fullText.append(word)
            
    sentence['tags'] = tags
    sentence['full-text'] = " ".join(fullText)

def GetSplit(dataset, key):
    trainingData = []
    testingData = []
    validationData = []
    
    for datapoint in dataset:
        split = datapoint[key]
        match split:
            case "train":
                trainingData.append(datapoint)
            case "dev":
                validationData.append(datapoint)
            case "test":
                testingData.append(datapoint)
                
    return Datasets(trainingData, testingData, validationData)

In [58]:
QueryDatasets, QuestionDatasets = GetDatasets("sources/atis.json")
VariableIdDictionary = GetVariableIdDictionary("sources/atis.json")

## Linear Model

Implementation from my submission for assignment 2 with minor alterations to handle this data

In [59]:
class LinearModel:
    def initialiseWordWeights(self, word):
        for label in self.labels:
            self.weights[(word, label)] = 0

    def __init__(self, labels, training_data):
        """Prepare the class member variables.
        Save the labels in self.labels and initialise all the weights to 0.

        Keyword arguments:
        labels -- a set of strings, each string is one SQL query
        training_data -- a list, each item is a tuple containing a question and an SQL query
        """


        self.labels = labels
        self.weights = {}
        for datapoint in training_data:
            words = datapoint[0].split()
            for word in words:
                self.initialiseWordWeights(word)         
        
    def get_features(self, question, label):
        """Produce a list of features for a specific question and label.
        
        Keyword arguments:
        question -- a string, an English question
        label -- a string, an SQL query
        """
        features = []
        for word in question.split():
            features.append((word, label))
        return features

    def get_score(self, question, label):
        """Calculate the model's score for a question, label pair.
        
        Keyword arguments:
        question -- a string, an English question
        label -- a string, an SQL query
        """
        score = 0
        if label in self.labels:
            for word in question.split():
                if (word, label) in self.weights:
                    score += self.weights[(word,label)]

        return score

    def update(self, question, label, change):
        """Modify the model.
        Changes all weights for features for the (question, SQL query) pair by the amount indicated.

        Keyword arguments:
        question -- a string, an English question
        label -- a string, an SQL query
        change -- an integer, how much to change the weights
        """
        for word in question.split():
            self.weights[(word, label)] += change

def getBestScoringLabel(scoreDict, labels):
    maxScore = -9999999
    maxLabel = ""

    for label in labels:
        if scoreDict[label] > maxScore:
            maxLabel = label
            maxScore = scoreDict[label]

    return maxLabel

# This is the function you need to implement
def find_best_code(question, model):
    """Predicts the SQL for a question by using a model to try all possible labels.

    Keyword arguments:
    question -- a string, the English question
    model -- a CodeModel, as defined in the Model question
    """
    labelScoreDict = {}
    for label in model.labels:
        labelScoreDict[label] = model.get_score(question, label)
    
    return getBestScoringLabel(labelScoreDict, model.labels)


def learn(question, answer, model):
    """Updates a model by predicting the SQL for a question and making a Perceptron update 

    Keyword arguments:
    question -- a string, the English question
    answer -- a string, the correct SQL query for this question 
    model -- a CodeModel, as defined in the Model question
    find_best_code -- a function, the one defined the Inference question
    """
    best_score = find_best_code(question, model)
    if best_score != answer:
        model.update(question, answer, 1)
        model.update(question, best_score, -1)
    
# These are the functions you need to implement
def calculate_accuracy(confusion_matrix, labels):
    """Returns the accuracy based on the contents of a confusion matrix

    Keyword arguments:
    confusion_matrix -- a dictionary, as defined in the Confusion Matrix question
    labels -- a set of strings, all the possible labels
    """
    correctCount = 0
    total = 0
    for label1 in labels:
        for label2 in labels:
            if label1 == label2:
                correctCount += confusion_matrix[(label1, label2)]
            total += confusion_matrix[(label1, label2)]
    
    if total > 0:
        return correctCount / total
    else: 
        return 0
def calculate_precision(confusion_matrix, labels):
    """Returns a dict containing the precision for each label based on the contents of a confusion matrix

    Keyword arguments:
    confusion_matrix -- a dictionary, as defined in the Confusion Matrix question
    labels -- a set of strings, all the possible labels
    """
    precisionDictionary = {}
    for testLabel in labels:
        truePositives = 0
        falsePositives = 0
        for otherLabel in labels:
            if testLabel == otherLabel:
                truePositives += confusion_matrix[(testLabel, testLabel)]
            else:
                falsePositives += confusion_matrix[(otherLabel, testLabel)]
        
        total = truePositives + falsePositives
        if total > 0:
            precisionDictionary[testLabel] = truePositives / (total)
        else:
            precisionDictionary[testLabel] = 0
            

    return precisionDictionary

def calculate_recall(confusion_matrix, labels):
    """Returns a dict containing the recall for each label based on the contents of a confusion matrix

    Keyword arguments:
    confusion_matrix -- a dictionary, as defined in the Confusion Matrix question
    labels -- a set of strings, all the possible labels
    """
    recallDictionary = {}
    for testLabel in labels:
        truePositives = 0
        falseNegatives = 0
        for otherLabel in labels:
            if testLabel == otherLabel:
                truePositives += confusion_matrix[(testLabel, testLabel)]
            else:
                falseNegatives += confusion_matrix[(testLabel, otherLabel)]
        
        total = truePositives + falseNegatives
        if total > 0:
            recallDictionary[testLabel] = truePositives / (truePositives + falseNegatives)
        else:
            recallDictionary[testLabel] = 0

    return recallDictionary

def calculate_macro_f1(confusion_matrix, labels):
    """Returns the Macro F-Score based on the contents of a confusion matrix

    Keyword arguments:
    confusion_matrix -- a dictionary, as defined in the Confusion Matrix question
    labels -- a set of strings, all the possible labels
    """
    precisionMetrics = calculate_precision(confusion_matrix, labels)
    recallMetrics = calculate_recall(confusion_matrix, labels)

    f1Total = 0
    for label in labels:
        precisionScore = precisionMetrics[label]
        recallScore = recallMetrics[label]
        
        denominator = precisionScore + recallScore
        if denominator != 0:
            f1Score = 2 * ((precisionScore * recallScore) / (precisionScore + recallScore))
            f1Total += f1Score

    return f1Total / len(labels)

def initializeConfusionMatrix(labels):
    confusionMatrix = {}
    for label1 in labels:
        for label2 in labels:
            confusionMatrix[(label1, label2)] = 0
            confusionMatrix[('unknown', label2)] = 0
    return confusionMatrix

# This is the function you need to implement
def get_confusion_matrix(eval_data, model):
    """Creates a confusion matrix by predicting the SQL for a question and recording how the answer compares with the true answer 

    Keyword arguments:
    eval_data -- a list of tuples containing the English question and the true SQL query
    model -- a CodeModel, as defined in the Model question
    find_best_code -- a function, the one defined the Inference question
    """
    confusionMatrix = initializeConfusionMatrix(model.labels)
    for datapoint in eval_data:
        trueAnswer = datapoint[1]
        predictedAnswer = find_best_code(datapoint[0], model)
        if (trueAnswer, predictedAnswer) in confusionMatrix:
            confusionMatrix[(trueAnswer, predictedAnswer)] += 1

    return confusionMatrix

def TrainAndEvaluateModel(model, iterations, trainData, validationData, testData):
    """Trains and evaluates a model on some read_data

    Keyword arguments:
    filename -- a string, the location of a json file containing data
    iterations -- an integer, the number of iterations of training to do
    read_data -- a function, as defined in the Data question
    model_maker -- a class, as defined in the Model question
    learn -- a function, as defined in the Learning question
    find_best_code -- a function, as defined in the Inference question
    get_confusion_matrix -- a function, as defined in the Confusion Matrix question
    calculate_accuracy -- a function, as defined in the Evaluation Metrics question
    calculate_macro_f1 -- a function, as defined in the Evaluation Metrics question
    """

    dev_scores = []
    i = 0
    while i < iterations:
        for datapoint in trainData:
            learn(datapoint[0], datapoint[1], model)
        
        confusionMatrix = get_confusion_matrix(validationData, model)
        devScore = {}
        devScore["accuracy"] = calculate_accuracy(confusionMatrix, model.labels)
        devScore["macro-f1"] = calculate_macro_f1(confusionMatrix, model.labels)

        dev_scores.append(devScore)
        i+=1

    confusionMatrix = get_confusion_matrix(testData, model)
    test_score = {}
    test_score["accuracy"] = calculate_accuracy(confusionMatrix, model.labels)
    test_score["macro-f1"] = calculate_macro_f1(confusionMatrix, model.labels)

    return dev_scores, test_score

train models

In [None]:
def getFullWordList(allSentences):
    longSentence = " ".join(allSentences)
    return longSentence.split(" ")

def flatten(xss):
    return [x for xs in xss for x in xs]

def FormatFeatureLabelPairs(features, labels):
    pairs = []
    print(len(features))
    print(len(labels))
    
    i = 0
    while i < len(features):
        pairs.append((features[i], labels[i]))
        i+=1
    return pairs

def GetTagPairs(dataset):
    fullWordList = getFullWordList(dataset['full-text'])
    flattenedTags = flatten(dataset['tags'])
    
    return FormatFeatureLabelPairs(fullWordList, flattenedTags)

def GetSqlLabels(allSentences, allTemplates):
    sqlLabels = []
    
    i = 0
    while i < len(allSentences):
        sentence = allSentences[i]
        words = sentence.split(" ")
        for _word in words:
            sqlLabels.append(allTemplates[i])
        i+=1
        
    return sqlLabels

def GetSqlPairs(dataset):
    fullWordList = getFullWordList(dataset['full-text'])
    sqlLabels = GetSqlLabels(dataset['full-text'], dataset['sql-template'])
    
    return FormatFeatureLabelPairs(fullWordList, sqlLabels)

def EvaluateLinearModels(datasets: Datasets):
    allVariables = VariableIdDictionary.keys()
    

    trainingTagPairs = GetTagPairs(datasets.trainingData)
    trainingSqlPairs = GetSqlPairs(datasets.trainingData)
    
    validationTagPairs = GetTagPairs(datasets.validationData)
    validationSqlPairs = GetSqlPairs(datasets.validationData)
    
    testingTagPairs = GetTagPairs(datasets.testingData)
    testingSqlPairs = GetSqlPairs(datasets.testingData)

    tagModel = LinearModel(allVariables, trainingTagPairs)
    sqlModel = LinearModel(datasets.sqlTemplateToIds.keys(), trainingSqlPairs)

    devScores, tagTestScores = TrainAndEvaluateModel(tagModel, 15, trainingTagPairs, validationTagPairs, testingTagPairs)
    print("tag test scores:")
    print(tagTestScores)
    
    devScores, sqlTestScores = TrainAndEvaluateModel(sqlModel, 15, trainingSqlPairs, validationSqlPairs, testingSqlPairs)
    print("sql test scores:")
    print(sqlTestScores)
    
    return tagModel, sqlModel

queryTagModel, querySqlModel = EvaluateLinearModels(QuestionDatasets)

TrainAndEvaluateModel()

47754
47754
47754
47754
5366
5366
5366
5366
4240
4240
4240
4240
tag test scores:
{'accuracy': 0.8629716981132075, 'macro-f1': 0.2593400714142085}
sql test scores:
{'accuracy': 0.011751326762699014, 'macro-f1': 0.001695960711449945}


TypeError: TrainAndEvaluateModel() missing 5 required positional arguments: 'model', 'iterations', 'trainData', 'validationData', and 'testData'

## Feed Forward

## LSTM

## Transformer