# Classification

In [23]:
import keras
import tensorflow as tf
import numpy as np
from keras import layers

import json
import pandas as pd

## Pre-Processing

In [24]:
class Datasets:
    """Train/test/validation splits plus an id lookup for SQL templates.

    On construction every distinct 'sql-template' value found in the training
    split is given an integer id (0, 1, 2, ... in order of first appearance)
    and each training datapoint receives a matching 'sql-id' entry. The
    testing and validation splits are stored as given.
    """

    def __init__(self, trainingData, testingData, validationData):
        self.trainingData = trainingData
        self.testingData = testingData
        self.validationData = validationData

        # The lookup has to exist before the id column can be filled in.
        self.sqlTemplateToIds = self.GetUniqueSqlTemplatesFromTrainingSet()
        self.AddSqlIdColumn(self.trainingData)

    def GetUniqueSqlTemplatesFromTrainingSet(self):
        """Return a dict mapping each training 'sql-template' to its id."""
        templateIds = {}
        for row in self.trainingData:
            # len() is evaluated before the insert, so an unseen template
            # gets the next unused id; a known template is left untouched.
            templateIds.setdefault(row['sql-template'], len(templateIds))
        return templateIds

    def AddSqlIdColumn(self, dataset):
        """Store under 'sql-id' the id of each datapoint's 'sql-template'.

        Mutates the datapoints of `dataset` in place. Raises KeyError when a
        template is not present in `self.sqlTemplateToIds`.
        """
        for row in dataset:
            row['sql-id'] = self.sqlTemplateToIds[row["sql-template"]]

In [31]:
def GetDatasets(fullDatasetFileLocation: str):
    """Load a JSON dataset file and split it two ways.

    The file is parsed as UTF-8 JSON, flattened with StripAndFormat, and then
    partitioned twice with GetSplit: once by 'query-split' and once by
    'question-split'.

    Returns:
        A tuple (query-split result, question-split result).
    """
    with open(fullDatasetFileLocation, 'r', encoding='utf-8') as datasetFile:
        rawDataset = json.load(datasetFile)

    # NOTE: the variable-id mapping is computed here but nothing below reads it.
    variableIds = GetVariableIdDictionary(rawDataset)

    flatRows = StripAndFormat(rawDataset)
    bySqlQuery = GetSplit(flatRows, 'query-split')
    byQuestion = GetSplit(flatRows, 'question-split')
    return bySqlQuery, byQuestion

def GetVariableIdDictionary(fullDataset: list):
    """Give every distinct variable name a sequential integer id.

    Walks each datapoint's 'sentences' and the keys of each sentence's
    'variables' mapping. Ids start at 0 and follow order of first appearance.

    Returns:
        dict mapping variable name -> id.
    """
    variableIds = {}
    for entry in fullDataset:
        for sentence in entry['sentences']:
            for name in sentence['variables'].keys():
                # len() is taken before insertion, so a new name receives the
                # next unused id and a repeated name keeps its first id.
                variableIds.setdefault(name, len(variableIds))
    return variableIds

def StripAndFormat(dataset):
    """Flatten the nested dataset into one record per sentence.

    ParseSentences is run first, which adds 'tags' and 'full-text' to every
    sentence of `dataset` in place. Each output record then carries the
    sentence's 'text', 'full-text', 'tags' and 'question-split', together
    with its datapoint's 'query-split' and a single 'sql-template'.

    Returns:
        A new list of dicts, in datapoint order and then sentence order.
    """
    ParseSentences(dataset)

    rows = []
    for entry in dataset:
        # One template per datapoint: the minimum of its 'sql' entries.
        # NOTE(review): if 'sql' is a list of strings this is the
        # lexicographically smallest one - confirm that is the intended pick.
        template = min(entry["sql"])
        rows.extend(
            {
                'text': sentence['text'],
                'full-text': sentence['full-text'],
                'sql-template': template,
                'tags': sentence['tags'],
                'query-split': entry['query-split'],
                'question-split': sentence['question-split'],
            }
            for sentence in entry['sentences']
        )

    return rows
    
def ParseSentences(dataset):
    """Apply AddFullTextAndTags to every sentence of every datapoint, in order."""
    for entry in dataset:
        sentences = entry['sentences']
        for item in sentences:
            AddFullTextAndTags(item)
            
            
def AddFullTextAndTags(sentence):
    """Attach per-word 'tags' and 'full-text' lists to `sentence` in place.

    sentence['text'] is split on whitespace. For each word that is a key of
    sentence['variables'], the tag is the word itself and the full-text entry
    is the variable's value; for any other word the tag is 'O' and the
    full-text entry is the word unchanged. Returns None.
    """
    tokens = sentence['text'].split()

    sentence['tags'] = [
        token if token in sentence['variables'] else 'O'
        for token in tokens
    ]
    sentence['full-text'] = [
        sentence['variables'][token] if token in sentence['variables'] else token
        for token in tokens
    ]

def GetSplit(dataset, key):
    """Partition `dataset` into train/test/validation sets wrapped in a Datasets.

    Each datapoint is routed by the value stored under `key`: "train" goes to
    the training set, "dev" to the validation set and "test" to the testing
    set. Datapoints with any other value are left out.

    Every datapoint is shallow-copied before it is stored. Datasets writes a
    'sql-id' entry into its training rows in place, so when the same
    `dataset` is split more than once (for example by 'query-split' and by
    'question-split') sharing the dicts would let a later split overwrite the
    ids written by an earlier one, leaving them inconsistent with that
    earlier object's id lookup.

    Args:
        dataset: iterable of dict datapoints.
        key: name of the entry holding the split label.

    Returns:
        A Datasets built from (training, testing, validation) lists.

    Raises:
        KeyError: if a datapoint has no entry for `key`.
    """
    trainingData = []
    testingData = []
    validationData = []

    for datapoint in dataset:
        split = datapoint[key]
        if split == "train":
            target = trainingData
        elif split == "dev":
            target = validationData
        elif split == "test":
            target = testingData
        else:
            # Unrecognised split labels are skipped.
            continue
        # Copy so that each Datasets owns the rows it may mutate.
        target.append(dict(datapoint))

    return Datasets(trainingData, testingData, validationData)

In [32]:
# Build both partitions of the dataset from the JSON file.
QueryDatasets, QuestionDatasets = GetDatasets("sources/atis.json")

# Spot-check one training example of the query split: its text and the
# per-word tags derived from it.
print(QueryDatasets.trainingData[120]['text'])
print(QueryDatasets.trainingData[120]['tags'])

please list all available flights from city_name1 california to city_name0 on wednesday
['O', 'O', 'O', 'O', 'O', 'O', 'city_name1', 'O', 'O', 'city_name0', 'O', 'O']


In [None]:
# Text vectorization layer configured for integer output.
# NOTE(review): custom_standardization, max_features and sequence_length are
# not defined in the part of the notebook visible here; unless they are
# defined elsewhere, this cell raises NameError - TODO confirm/define them.
vectorize_layer = keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

## Feed Forward

## LSTM

## Transformer