# Vector Store Creation with SQL

In [1]:
import teradataml as tdml
import json

## 1 - Connect to Vantage

In [2]:
# Record the installed teradataml version alongside the results.
print('teradataml version    :',tdml.__version__)

teradataml version    : 20.00.00.05


In [None]:
# Load the connection parameters (keyword arguments for create_context) from JSON.
with open('../connection/connection.json', 'r') as connection_file:
    Param = json.load(connection_file)

# The database entry is set to the user name before connecting.
Param['database'] = Param['user']

# Open the Vantage connection with the loaded parameters.
tdml.create_context(**Param)

## 2 - The data

In [4]:
# Source table holding the text chunks, and the database it lives in
# (the same database that was passed to create_context).
table_name = 'denis_pdfs'
database   = Param['database']

In [5]:
# Fetch the DDL of the source table: first column of the first row returned by SHOW TABLE.
source_ddl_rows = tdml.execute_sql(f"SHOW TABLE {database}.{table_name}").fetchall()
source_ddl = source_ddl_rows[0][0]

# Carriage returns are swapped for newlines so the DDL prints on separate lines.
print(source_ddl.replace('\r','\n'))

CREATE SET TABLE dm250067.denis_pdfs ,FALLBACK ,
     NO BEFORE JOURNAL,
     NO AFTER JOURNAL,
     CHECKSUM = DEFAULT,
     DEFAULT MERGEBLOCKRATIO,
     MAP = TD_MAP2
     (
      TD_ID INTEGER,
      TD_FILENAME VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
      CHUNKS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC)
PRIMARY INDEX ( TD_ID );


In [12]:
# teradataml DataFrame on the source table; the bare name on the last line displays a preview.
dataset = tdml.DataFrame(tdml.in_schema(database,table_name))
dataset



TD_ID,TD_FILENAME,CHUNKS
40,2404.16130v1.pdf,"which people inspect, engage with, and contextualize data within the broader scope of real-world activities (Koesten et al., 2021). Similarly, methods for extracting latent summarization queries from source texts also exist (Xu and Lapata, 2021), but such extracted questions can target details that betray prior knowledge of the texts. To evaluate the effectiveness of RAG systems for more global sensemaking tasks, we need questions that convey only a high-level understanding of dataset contents,"
120,2404.03622v3.pdf,"ning empty squares is : B. 3 The final configuration would look like this: Visualize the state after each reasoning step Lets think step by step. (a) Common behaviors of V oT and CoT prompting in visual tiling task, with the overall track rate of 87.1% and 57.4% respectively. To navigate from to , we need to find the path that avoids obstacles ( ) and follows the roads ( ). Here's the step -by-step navigation : 1. Move down from to the end of the continuous road. After the move: 2."
57,2404.03622v3.pdf,ing Rate Succ Rate GPT-3.5 CoT 16.10 2.62 17.42 44.10 8.50 GPT-3.5 V oT 19.02 1.61 13.10 47.99 9.00 LLAMA3-8B CoT 4.65 0 28.73 47.24 16.50 LLAMA3-8B V oT 4.97 0.2 26.75 46.73 15.50 LLAMA3-70B CoT 19.90 2.62 49.01 56.41 26.00 LLAMA3-70B V oT 30.24 5.85 54.09 56.03 32.50 Table 3: Performance of V oT in GPT-3.5 and LLAMA3 models. Underline denotes statistical significance with p < 0.05 compared to corresponding CoT baseline using two-sample z-test. geometric patterns [ Cho19 ] and 3D spatial inform
57,2404.07143v2.pdf,"methods such as position interpolation techniques (Chen et al., 2023a) can be data efficient as they only adjust the positional bias in the attention layer, they are still costly for inference. The attention mechanism is also prone to the issues of attention sink (Xiao et al., 2023) and lost-in-the-middle (Liu et al., 2024). Consequently, they struggle in a regime where context length is longer than what was observed during training (Press et al., 2021; Kazemnejad et al., 2024). The proposed In"
57,Efficiency-Productivity-and-Speed-to-Deployment-MD007074.pdf,"n Enterprise Feature Store must be actively managed. Due consideration must be given not only to data freshness and the frequency of updates, but also to the removal of redundant features that are no longer being used. Precise cataloguing of all features is critical to prevent a feature store becoming a feature swamp. The dependencies between and within features (including data, processes, models and functions) must be clearly documented and understood. Users must be able to discover an"
158,2404.03622v3.pdf,"Tetromino L fits perfectly in the top left corner. Now, we can place the Tetromino I in the remaining space: ``` ``` The Tetromino I fits perfectly in the remaining space using Variation 1 (vertical placement). Therefore, the correct variation of Tetromino L that fits into the target rectangle is: A. 1 The initial attempt to place Variation 1 of Tetromino L was incorrect because it was not placed in the top left corner. Upon correcting the placement, we find that Variation 1"
57,2404.16130v1.pdf,"prehensive and structured overview of public figures across various sectors of the entertainment industry, including film, television, music, sports, and digital media. It lists multiple individuals, providing specific examples of their contributions and the context in which they are mentioned in entertainment articles, along with references to data reports for each claim. This approach helps the reader understand the breadth of the topic and make informed judgments without being misled. In cont"
40,Efficiency-Productivity-and-Speed-to-Deployment-MD007074.pdf,"al organization will determine what analysis is planned, what system resources are available, and how to define the refresh methodology. Obviously, it is not possible to compute every possible variable in an EFS and models will invariably have features that are specific to their requirements or features that need to be tweaked. The goal is to standardize the most common and widely used variables. In some cases, the standard EFS will be all that is required for a new analysis to be compl"
40,2404.07143v2.pdf,"th memory length of 65K and achieves 114x compression ratio. Position Embeddings (PE). As shown Figure 1, we dont use position embeddings for the key and query vectors of the compressive memory to store only global contextual information in the long-term memory. The PEs were applied to the QK vectors only after the compressive memory reading and update. 4.2 Long-context Language Modeling We trained and evaluated small Infini-Transformer models on PG19 (Rae et al., 2019) and Arxiv-math (Wu et al."
40,2404.03622v3.pdf,"s difficulty levels are provided in figure 9 and table 6 in appendix. We use this term for simplicity. In natural language navigation tasks, LLMs often output additional words in the extracted answer besides the expected object name. For example, ""Answer: You will find ..."". In this case, sub-string matching is adopted without affecting the correctness. Otherwise, exact matching is adopted for multiple choice questions in visual tasks. 6SettingsVisual Navigation Visual TilingNatural-Language Nav"


## 3 - Vector Embedding Process : Using AWS Bedrock

### 3.1 - The use of td_byone()

Calling AWS Bedrock requires the operation to run on a single AMP. To achieve this, we add a column computed with the function `td_byone()` and later partition the input by that column.

In [15]:
# Build the SELECT that feeds the embedding function: every column of the source
# table plus a td_byone() column used later as the PARTITION BY key.
#
# The select list is assembled outside the f-string on purpose: a backslash inside
# an f-string replacement field is a SyntaxError on Python versions before 3.12.
select_list = '\n, '.join(dataset.columns)

query_dataset = f"""
SELECT
{select_list}
, td_byone() AS  FOR_SINGLE_AMP_ATTRIBUTION
FROM {database}.{table_name}
"""

print(query_dataset)


SELECT
TD_ID
, TD_FILENAME
, CHUNKS
, td_byone() AS  FOR_SINGLE_AMP_ATTRIBUTION
FROM dm250067.denis_pdfs



In [16]:
# Run the query and display a preview, to check the extra FOR_SINGLE_AMP_ATTRIBUTION column.
tdml.DataFrame.from_query(query_dataset)



TD_ID,TD_FILENAME,CHUNKS,FOR_SINGLE_AMP_ATTRIBUTION
40,2404.16130v1.pdf,"which people inspect, engage with, and contextualize data within the broader scope of real-world activities (Koesten et al., 2021). Similarly, methods for extracting latent summarization queries from source texts also exist (Xu and Lapata, 2021), but such extracted questions can target details that betray prior knowledge of the texts. To evaluate the effectiveness of RAG systems for more global sensemaking tasks, we need questions that convey only a high-level understanding of dataset contents,",26535
120,2404.03622v3.pdf,"ning empty squares is : B. 3 The final configuration would look like this: Visualize the state after each reasoning step Lets think step by step. (a) Common behaviors of V oT and CoT prompting in visual tiling task, with the overall track rate of 87.1% and 57.4% respectively. To navigate from to , we need to find the path that avoids obstacles ( ) and follows the roads ( ). Here's the step -by-step navigation : 1. Move down from to the end of the continuous road. After the move: 2.",26535
57,2404.03622v3.pdf,ing Rate Succ Rate GPT-3.5 CoT 16.10 2.62 17.42 44.10 8.50 GPT-3.5 V oT 19.02 1.61 13.10 47.99 9.00 LLAMA3-8B CoT 4.65 0 28.73 47.24 16.50 LLAMA3-8B V oT 4.97 0.2 26.75 46.73 15.50 LLAMA3-70B CoT 19.90 2.62 49.01 56.41 26.00 LLAMA3-70B V oT 30.24 5.85 54.09 56.03 32.50 Table 3: Performance of V oT in GPT-3.5 and LLAMA3 models. Underline denotes statistical significance with p < 0.05 compared to corresponding CoT baseline using two-sample z-test. geometric patterns [ Cho19 ] and 3D spatial inform,26535
57,2404.07143v2.pdf,"methods such as position interpolation techniques (Chen et al., 2023a) can be data efficient as they only adjust the positional bias in the attention layer, they are still costly for inference. The attention mechanism is also prone to the issues of attention sink (Xiao et al., 2023) and lost-in-the-middle (Liu et al., 2024). Consequently, they struggle in a regime where context length is longer than what was observed during training (Press et al., 2021; Kazemnejad et al., 2024). The proposed In",26535
57,Efficiency-Productivity-and-Speed-to-Deployment-MD007074.pdf,"n Enterprise Feature Store must be actively managed. Due consideration must be given not only to data freshness and the frequency of updates, but also to the removal of redundant features that are no longer being used. Precise cataloguing of all features is critical to prevent a feature store becoming a feature swamp. The dependencies between and within features (including data, processes, models and functions) must be clearly documented and understood. Users must be able to discover an",26535
158,2404.03622v3.pdf,"Tetromino L fits perfectly in the top left corner. Now, we can place the Tetromino I in the remaining space: ``` ``` The Tetromino I fits perfectly in the remaining space using Variation 1 (vertical placement). Therefore, the correct variation of Tetromino L that fits into the target rectangle is: A. 1 The initial attempt to place Variation 1 of Tetromino L was incorrect because it was not placed in the top left corner. Upon correcting the placement, we find that Variation 1",26535
57,2404.16130v1.pdf,"prehensive and structured overview of public figures across various sectors of the entertainment industry, including film, television, music, sports, and digital media. It lists multiple individuals, providing specific examples of their contributions and the context in which they are mentioned in entertainment articles, along with references to data reports for each claim. This approach helps the reader understand the breadth of the topic and make informed judgments without being misled. In cont",26535
40,Efficiency-Productivity-and-Speed-to-Deployment-MD007074.pdf,"al organization will determine what analysis is planned, what system resources are available, and how to define the refresh methodology. Obviously, it is not possible to compute every possible variable in an EFS and models will invariably have features that are specific to their requirements or features that need to be tweaked. The goal is to standardize the most common and widely used variables. In some cases, the standard EFS will be all that is required for a new analysis to be compl",26535
40,2404.07143v2.pdf,"th memory length of 65K and achieves 114x compression ratio. Position Embeddings (PE). As shown Figure 1, we dont use position embeddings for the key and query vectors of the compressive memory to store only global contextual information in the long-term memory. The PEs were applied to the QK vectors only after the compressive memory reading and update. 4.2 Long-context Language Modeling We trained and evaluated small Infini-Transformer models on PG19 (Rae et al., 2019) and Arxiv-math (Wu et al.",26535
40,2404.03622v3.pdf,"s difficulty levels are provided in figure 9 and table 6 in appendix. We use this term for simplicity. In natural language navigation tasks, LLMs often output additional words in the extracted answer besides the expected object name. For example, ""Answer: You will find ..."". In this case, sub-string matching is adopted without affecting the correctness. Otherwise, exact matching is adopted for multiple choice questions in visual tasks. 6SettingsVisual Navigation Visual TilingNatural-Language Nav",26535


### 3.2 - Creation of the Embedding with AI_TEXTEMBEDDINGS

In [17]:
# Target table for the embeddings, its primary index, and the column to embed.
vector_store_table   = 'denis_pdfs_embeddings'
primary_index        = ['TD_ID']
text_column          = 'chunks'

# Settings passed to the USING clause of AI_TEXTEMBEDDINGS.
embedding_model      = 'amazon.titan-embed-text-v1'
apitype              = 'aws'
region               = 'us-west-2'
# NOTE(review): presumably an authorization object that already exists in the database
# and holds the AWS credentials — confirm it is created before running this notebook.
authorization_object = 'AWSEmbeddingsAuth'

In [18]:
# Drop any previous copy of the vector store table so the CREATE TABLE below can be re-run.
try:
    tdml.execute_sql(f"DROP TABLE {database}.{vector_store_table}")
except Exception as drop_error:
    # Teradata error 3807 means "object does not exist": on a first run there is
    # simply nothing to drop. Any other failure (permissions, locks, lost session, ...)
    # is a real problem and is re-raised.
    if '3807' not in str(drop_error):
        raise

TeradataCursor uRowsHandle=68 bClosed=False

In [19]:
# Build the CREATE TABLE ... AS statement that calls AI_TEXTEMBEDDINGS on the
# single-partition input query and stores the result in the vector store table.
#
# The computed fragments are assembled outside the f-string on purpose: a backslash
# inside an f-string replacement field is a SyntaxError on Python versions before 3.12.

# Indent every line of the inner SELECT by one tab so it reads well inside ON ( ... ).
input_query = query_dataset.replace('\n', '\n\t')

# Quoted, comma-separated list of columns copied to the output.
# NOTE(review): this comparison is case-sensitive. With text_column = 'chunks' and a
# source column named 'CHUNKS' nothing is filtered out, so CHUNKS is accumulated as
# well. That behaviour is kept as-is here; confirm whether the text column should
# really be excluded before making the comparison case-insensitive.
accumulate_list = ",".join("'" + x + "'" for x in dataset.columns if x != text_column)

primary_index_list = ','.join(primary_index)

embedding_query = f"""
CREATE MULTISET TABLE {database}.{vector_store_table} AS
(
SELECT *
FROM AI_TEXTEMBEDDINGS (
    ON ({input_query}) AS InputTable
    PARTITION BY FOR_SINGLE_AMP_ATTRIBUTION
    USING
        authorization({authorization_object})
        TextColumn('{text_column}')
        ApiType('{apitype}')
        REGION('{region}')
        ModelName('{embedding_model}')
        outputformat('vector')
        Accumulate({accumulate_list})
) AS DT
) WITH DATA
PRIMARY INDEX ({primary_index_list})
"""

print(embedding_query)


CREATE MULTISET TABLE dm250067.denis_pdfs_embeddings AS
(
SELECT *
FROM AI_TEXTEMBEDDINGS (
    ON (
	SELECT
	TD_ID
	, TD_FILENAME
	, CHUNKS
	, td_byone() AS  FOR_SINGLE_AMP_ATTRIBUTION
	FROM dm250067.denis_pdfs
	) AS InputTable
    PARTITION BY FOR_SINGLE_AMP_ATTRIBUTION
    USING
        authorization(AWSEmbeddingsAuth)
        TextColumn('chunks')
        ApiType('aws')
        REGION('us-west-2')
        ModelName('amazon.titan-embed-text-v1')
        outputformat('vector')
        Accumulate('TD_ID','TD_FILENAME','CHUNKS')
) AS DT
) WITH DATA
PRIMARY INDEX (TD_ID)



In [20]:
%%time
# Execute the CREATE TABLE ... AS statement; %%time reports how long the cell takes.
tdml.execute_sql(embedding_query)

CPU times: total: 15.6 ms
Wall time: 41 s


TeradataCursor uRowsHandle=69 bClosed=False

In [21]:
# teradataml DataFrame on the newly created vector store table; the bare name displays a preview.
vector_table = tdml.DataFrame(tdml.in_schema(database, vector_store_table))
vector_table



TD_ID,TD_FILENAME,CHUNKS,Embedding,Message
95,2404.16130v1.pdf,"h/. Laskar, M. T. R., Hoque, E., and Huang, J. (2020). Query focused abstractive summarization via incorporating query relevance and transfer learning with transformer models. In Advances in Artificial Intelligence: 33rd Canadian Conference on Artificial Intelligence, Canadian AI 2020, Ottawa, ON, Canada, May 1315, 2020, Proceedings 33 , pages 342348. Springer. Laskar, M. T. R., Hoque, E., and Huang, J. X. (2022). Domain adaptation with pre-trained transform- ers for query-focused abstractive te","0.65625,-0.116211,0.119629,-0.214844,0.0476074,-0.220703,0.0708008,-9.01222e-05,-0.00836182,0.0693359,0.137695,0.695312,-0.0500488,0.179688,-0.316406,0.314453,0.229492,0.416016,-0.925781,0.0366211,-0.165039,-0.137695,-0.206055,0.351562,-0.132812,0.196289,0.104004,0.585938,-0.458984,0.00166321,0.314453,0.065918,0.546875,0.300781,-0.287109,0.121582,0.515625,0.367188,-0.12207,-0.0874023,0.769531,-0.135742,0.111328,-0.0786133,0.0952148,-0.00805664,-0.125,0.0922852,-0.78125,0.0849609,-0.134766,-0.271484,0.239258,0.328125,-0.265625,-0.125977,-0.0510254,-0.0620117,0.0678711,0.330078,0.100586,0.359375,0.078125,0.199219,-0.188477,0.269531,0.28125,-0.0512695,0.550781,0.273438,-0.192383,-0.0108032,0.378906,0.265625,1.14844,-0.125,-0.376953,0.585938,0.111328,-0.490234,-0.585938,0.558594,0.0756836,0.466797,0.139648,0.206055,-0.447266,0.0554199,0.000127792,0.382812,0.335938,0.176758,0.355469,-0.259766,0.447266,-0.198242,-0.0177002,0.175781,0.0966797,-0.515625,0.0568848,-0.339844,0.100098,-0.0678711,0.0834961,-0.214844,-0.0",
112,2404.03622v3.pdf,"ou made following movements: 1. Move right to the end of continuous road. 2. Move down to the end of continuous road. 3. Move left to the end of continuous road. What's the direction of next movement? A. Up B. Left C. Down D. Right Visualize the state after each reasoning step.Navigation Task: for a provided map, is the home as starting point, is the office as the destination. means the road, means the obstacle. There exists one and only one viable route for each map. Each ste","0.792969,-0.0673828,0.255859,0.133789,0.648438,0.224609,0.0476074,0.000133514,-0.251953,0.345703,0.703125,-0.462891,0.384766,0.396484,-0.0576172,-0.234375,0.384766,0.613281,-0.5625,0.298828,-0.753906,0.265625,-0.0917969,-0.100098,0.115234,0.195312,0.65625,-0.488281,0.0234375,-0.365234,-0.546875,0.333984,0.216797,0.46875,-0.691406,-0.519531,0.246094,-0.097168,0.425781,-0.124512,-0.851562,0.59375,-0.198242,0.318359,-0.396484,0.208008,0.0927734,-0.78125,0.53125,0.353516,0.00750732,0.0576172,-0.423828,-0.392578,-0.200195,0.3125,0.279297,-0.376953,-0.0546875,-0.209961,-0.410156,0.613281,0.118652,-0.408203,-0.140625,0.273438,0.546875,-0.378906,0.0742188,0.496094,0.114258,-0.554688,0.292969,0.103516,0.628906,0.294922,0.0446777,0.114746,0.0344238,0.378906,0.257812,-0.211914,-0.546875,0.28125,-0.251953,0.765625,0.458984,0.167969,0.000137329,0.00891113,-0.34375,0.316406,0.019165,-0.339844,0.0610352,0.0322266,-0.310547,0.081543,0.578125,-0.458984,-0.388672,0.320312,0.090332,0.225586,-0.480469,0.478516,0.205078,0.0844727",
49,2404.03622v3.pdf,"sured by the proportion of correct answers when the corresponding visualization is generated accurately. As could be seen from Table 2, LLMs demonstrate promising potential in performing multi-hop visualization while adhering to spatial constraints, with compliance rates of approximately 51-52%. However, the relatively low accuracy of state visualization (around 24%-26%) indicates a need for significant improvements in this area. Despite this limitation, LLMs are able to make correct decisions i","0.8125,-0.152344,0.439453,-0.419922,-0.109375,-0.142578,-0.15625,0.000246048,-0.180664,0.111328,0.353516,-0.10498,0.227539,0.0429688,-0.351562,0.00601196,0.15625,0.585938,-0.318359,0.186523,-0.11084,0.277344,0.105957,-0.115723,0.210938,-0.0678711,0.168945,0.429688,-0.104004,-0.0144653,-0.140625,0.695312,0.337891,-0.09375,-0.230469,-0.0284424,-0.106934,0.285156,-0.0371094,0.248047,-0.131836,0.832031,-0.166016,0.332031,-0.144531,0.181641,-0.126953,0.0610352,-0.330078,0.267578,0.605469,0.271484,-0.59375,0.158203,0.332031,-0.449219,0.632812,-0.0849609,-0.206055,0.231445,0.0371094,0.0654297,0.496094,0.146484,-0.189453,-0.0693359,0.133789,-0.558594,-0.191406,0.408203,-0.0991211,-0.566406,0.337891,-0.00579834,0.554688,-0.185547,-0.0458984,0.0179443,0.0218506,-0.109375,-0.222656,0.152344,-0.396484,0.453125,0.326172,0.0620117,0.421875,0.237305,0.000133514,-0.671875,-0.210938,0.0483398,-0.239258,0.0378418,0.235352,-0.020874,-0.355469,0.0229492,0.404297,-0.761719,-0.353516,0.209961,-0.384766,0.0908203,-0.404297,-0.16015",
49,2404.07143v2.pdf,that a 8B LLaMA model can solve the task up to 32K length when fine-tuned with the same 32K length inputs with Position Interpolation. We take this challenge further and fine-tune on only 5K length inputs to test on 1M length regime. Input lengthRouge overall score 17181920 16K 32K 64K 128K 256K 500K Figure 4: Infini-Transformers obtain better Rouge overall scores with more book text pro- vided as input.Table 3 reports the token-level accuracy for test subsets with input lengths ranging from 32,"0.423828,0.287109,0.25,-0.173828,-0.034668,-0.267578,-0.347656,0.000189781,-0.419922,-0.298828,0.460938,0.0727539,0.166992,0.726562,-0.65625,0.09375,0.198242,0.466797,-0.291016,0.208984,-0.357422,-0.223633,0.170898,0.241211,0.527344,-0.192383,-0.355469,0.427734,0.0986328,0.0228271,-0.462891,0.458984,0.421875,0.000419617,0.203125,-0.115234,-0.253906,0.289062,0.660156,0.130859,0.324219,0.166992,-0.211914,-0.0322266,0.090332,-0.242188,0.322266,-0.0439453,-0.15332,0.0478516,-0.0625,0.353516,-0.628906,0.155273,0.0368652,0.163086,0.100098,-0.25,-1.05469,0.28125,0.133789,0.15332,0.167969,0.0844727,0.0927734,0.203125,0.0483398,-0.238281,-0.265625,0.265625,0.0908203,-0.617188,0.375,-0.0678711,0.578125,0.291016,-0.161133,0.238281,0.0197754,-0.0834961,-0.574219,0.289062,0.0991211,0.490234,-0.277344,-0.114746,0.0147705,0.455078,0.0001297,-0.443359,-0.455078,0.28125,0.318359,-0.0275879,0.398438,-0.230469,-0.457031,0.151367,-0.148438,-0.296875,0.00567627,0.0456543,-0.0314941,-0.105469,-0.060791,0.00836182,0.699219,-0.04321",
49,2404.16130v1.pdf,"informed judgements about the topic? Directness . How specifically and clearly does the answer address the question? For our evaluation, the LLM is provided with the question, target metric, and a pair of answers, and asked to assess which answer is better according to the metric, as well as why. It returns the winner if one exists, otherwise a tie if they are fundamentally similar and the differences are negligible. To account for the stochasticity of LLMs, we run each comparison five times an","0.435547,0.046875,0.111328,-0.808594,0.0111694,-0.161133,-0.375,0.000192642,-0.0629883,0.00811768,0.135742,-0.201172,0.373047,0.322266,-0.166992,-0.337891,0.464844,0.671875,-0.324219,0.175781,-0.523438,0.0917969,0.157227,0.0388184,0.636719,-0.181641,0.0105591,0.090332,0.265625,0.00616455,0.15918,0.613281,0.785156,-0.101074,0.251953,-0.10791,-0.199219,0.0361328,0.0654297,0.322266,0.251953,0.229492,-0.0878906,0.200195,-0.503906,0.114258,0.261719,-0.230469,0.265625,0.161133,0.177734,0.0371094,-0.265625,0.0639648,0.0476074,-0.208984,0.0942383,0.142578,-0.664062,0.045166,-0.0834961,0.0432129,-0.167969,0.0397949,-0.160156,0.582031,0.443359,-0.287109,-0.519531,0.353516,-0.238281,-0.474609,0.233398,-0.660156,0.761719,0.177734,-0.0698242,0.15332,-0.132812,0.691406,-0.373047,0.222656,-0.212891,-0.269531,-0.253906,0.0184326,-0.225586,0.439453,0.000128746,-0.105957,-0.5625,0.484375,-0.000881195,-0.02771,0.296875,-0.0639648,-0.367188,-0.167969,0.464844,-0.503906,0.0673828,0.178711,-0.28125,0.0566406,-0.414062,0.160156,0.2",
129,2404.03622v3.pdf,"nding visualizations in an interleaved manner. Each visualization reflects the temporal casuality of the system state. This kind of interleaved sequence tracks the system state over time, thus reflecting spatiotemporal casuality. D Performance Trends Across Levels In this analysis, we examine performance trends across varying difficulty levels in the next-step prediction task for models utilizing either CoT or V oT methods. These trends are crucial for understanding the inherent unpredictability","0.78125,-0.220703,0.396484,-0.527344,0.0932617,-0.271484,-0.162109,0.000118256,-0.166016,0.0844727,0.462891,0.010498,0.197266,0.15332,-0.507812,-0.201172,0.0795898,-0.234375,-0.451172,0.349609,-0.115234,-0.0134277,-0.0473633,-0.227539,0.166992,0.289062,0.244141,0.182617,0.0244141,-0.165039,-0.824219,0.373047,0.292969,0.326172,-0.227539,0.163086,-0.175781,0.707031,0.118652,0.396484,-0.359375,0.251953,0.111816,0.390625,-0.162109,0.223633,-0.0131836,-0.0864258,0.114746,0.216797,0.235352,0.145508,-0.683594,-0.189453,0.507812,-0.457031,0.168945,0.117188,-0.18457,0.0634766,0.0351562,0.203125,0.0422363,0.0617676,-0.421875,-0.0888672,-0.316406,-0.539062,0.183594,0.486328,-0.0253906,-0.839844,0.429688,-0.384766,0.484375,0.0429688,-0.300781,-0.179688,-0.150391,-0.285156,0.0517578,-0.0961914,0.052002,0.769531,0.0771484,-0.0415039,0.279297,0.136719,0.0001297,-0.253906,-0.0166016,-0.137695,0.0761719,-0.119629,0.369141,0.0883789,-0.375,-0.259766,0.165039,-0.527344,-0.152344,-0.00357056,-0.135742,0.470703,-0.208984,-0.26171",
49,Efficiency-Productivity-and-Speed-to-Deployment-MD007074.pdf,"t Practice guidelines for creating an EFS The rationale behind creating an enterprise feature store is to support predictive analytic models that produce ongoing value to the business which exceeds the cost incurred in creating them. Therefore, the business community must play a large role in defining the objectives and focus of the EFS. The specific contents, logic, and physical storage aspects of an EFS will require a team effort to outline and without active business involvement and","0.554688,0.0111694,0.605469,-0.310547,0.249023,-0.0834961,0.234375,0.00038147,-0.137695,0.161133,-0.0252686,0.429688,-0.245117,-0.125977,-0.277344,0.285156,-0.00543213,0.441406,-0.00182343,0.166016,0.0629883,0.351562,-0.0869141,0.306641,-0.164062,0.0111084,0.597656,-0.380859,-0.177734,0.0712891,-0.589844,-0.535156,-0.0893555,-0.160156,-0.0334473,0.143555,-0.146484,0.0157471,0.773438,-0.157227,-0.122559,0.410156,0.0908203,0.00370789,-0.341797,-0.566406,0.667969,0.302734,-0.414062,0.404297,0.219727,-0.302734,-0.447266,0.267578,0.275391,-0.314453,-0.0117798,0.423828,0.0529785,-0.137695,-0.10791,0.648438,-0.00424194,0.013916,-0.0119019,0.193359,-0.0274658,-0.00726318,0.625,0.0957031,-0.285156,-0.546875,0.106445,0.0524902,-0.28125,0.102051,-0.0751953,-0.168945,-0.249023,-0.179688,-0.292969,0.169922,-0.0236816,-0.15332,0.351562,-0.219727,0.117188,-0.189453,0.000133514,0.197266,0.126953,0.419922,0.230469,-0.166992,0.263672,0.324219,-0.523438,0.171875,-0.186523,-0.217773,0.0756836,-0.253906,0.433594,-0.104004,-0.1757",
95,2404.03622v3.pdf,"[SM71] Roger N. Shepard and Jacqueline Metzler. Mental rotation of three-dimensional objects. Science , 171:701 703, 1971. 13[SMM21] Harini Sampath, Alice Merrick, and Andrew Macvean. Accessibility of command line interfaces. In Proceedings of the 2021 CHI Conference on Human Factors in Computing Systems , pages 110, 2021. [SZL22] Zhengxiang Shi, Qiang Zhang, and Aldo Lipani. Stepgame: A new benchmark for robust multi-hop spatial reasoning in texts. AAAI Conference on Artificial Intelligence ,","0.498047,-0.0256348,-0.00297546,-0.00680542,-0.0385742,-0.045166,-0.124512,0.000541687,0.200195,0.0126953,0.142578,0.257812,0.0654297,-0.060791,-0.275391,-0.300781,-0.115723,0.275391,-0.208984,0.164062,-0.355469,0.0791016,0.163086,0.220703,-0.142578,0.263672,0.332031,-0.114746,0.065918,-0.291016,-0.314453,0.144531,-0.0483398,0.00872803,-0.212891,0.0678711,0.222656,-0.0922852,-0.0478516,-0.210938,-0.115234,0.050293,-0.124512,0.0255127,-0.195312,0.0893555,-0.132812,-0.0588379,-0.00491333,0.167969,-0.0140381,0.166992,0.102539,0.0571289,-0.166016,-0.4375,0.353516,0.118164,0.0238037,0.00952148,0.0957031,0.0673828,0.0917969,0.337891,-0.0349121,-0.0620117,0.414062,-0.267578,0.429688,0.0388184,-0.168945,-0.135742,0.294922,-0.200195,0.314453,-0.125,0.147461,0.0742188,0.0908203,-0.183594,0.119629,0.147461,-0.269531,0.371094,-0.165039,0.00704956,0.135742,0.196289,0.0001297,-0.769531,0.239258,0.337891,-0.0524902,0.0517578,0.0273438,0.198242,0.0703125,-0.25,0.0712891,0.118652,-0.137695,0.0483398,-0.0505371,-0.0932617,-0.1",
99,2404.03622v3.pdf,"D coordinate system with origin (0, 0) as the starting point. To guarantee an unique answer in each navigation map, the moving distance of each instruction is dynamically calculated to avoid overlapping. Each time when an overlapping is detected, the moving distance of previous instruction will be increased by 1 unit recursively until overlapping is resolved. As the distance is determined, those corresponding points are added to the navigating path. After all instructions are completed, the fina","0.0432129,0.249023,0.369141,0.115723,0.0211182,-0.219727,-0.201172,0.00028801,0.0055542,0.0952148,0.382812,-0.0378418,0.361328,0.353516,-0.0634766,-0.0629883,0.277344,0.457031,-0.353516,0.326172,-0.318359,0.332031,0.0615234,0.320312,-0.212891,0.234375,0.339844,-0.230469,-0.0153809,-0.328125,-0.306641,0.0578613,0.326172,0.363281,-0.71875,-0.152344,-0.0878906,0.287109,-0.00268555,-0.0688477,-0.412109,0.241211,-0.0708008,0.259766,-0.133789,0.0038147,-0.0371094,-0.310547,0.478516,0.271484,0.675781,-0.111328,-0.667969,0.0610352,-0.168945,-0.0554199,0.103516,-0.197266,0.128906,-0.143555,-0.386719,0.133789,0.18457,-0.375,-0.0859375,-0.0795898,0.326172,-0.503906,-0.0319824,-0.0279541,0.234375,-0.613281,0.145508,-0.00787354,0.235352,0.320312,0.183594,0.390625,-0.100586,0.209961,0.179688,0.0402832,-0.102539,0.396484,-0.243164,0.458984,0.378906,0.240234,0.000130653,-0.302734,-0.0617676,0.353516,0.0791016,0.0805664,0.236328,0.116699,-0.259766,0.103027,0.253906,-0.460938,-0.206055,0.300781,0.147461,0.10791,-0.308594,0.224",
99,2404.16130v1.pdf,"first graph rag: Retrieval-augmented genera- tion with llm based on knowledge graphs. https://www .nebula-graph .io/posts/graph-RAG. Neo4J (2024). Project NaLLM. https://github .com/neo4j/NaLLM. Newman, M. E. (2006). Modularity and community structure in networks. Proceedings of the national academy of sciences , 103(23):85778582. Ram, O., Levine, Y ., Dalmedigos, I., Muhlgay, D., Shashua, A., Leyton-Brown, K., and Shoham, Y . (2023). In-context retrieval-augmented language models. Transactions","0.141602,0.507812,0.0678711,-0.251953,0.0864258,0.0351562,-0.267578,0.000347137,-0.259766,0.135742,-0.25,0.111328,0.300781,-0.00982666,-0.349609,-0.220703,0.423828,0.601562,-0.261719,0.203125,-0.367188,0.020752,-0.175781,-0.060791,0.211914,-0.161133,-0.15332,-0.0463867,-0.214844,-0.122559,0.0612793,0.408203,0.263672,0.53125,0.0932617,0.103516,-0.0140381,0.103516,0.302734,0.24707,0.112305,-0.40625,-0.330078,-0.0461426,0.152344,0.345703,0.065918,0.107422,0.542969,-0.149414,0.0148315,0.059082,-0.185547,0.199219,-0.192383,-0.00561523,0.057373,-0.320312,-0.0834961,0.020874,0.0810547,0.0186768,0.0673828,0.167969,-0.261719,0.308594,-0.335938,-0.378906,-0.220703,-0.074707,-0.21582,-0.289062,0.394531,-0.00927734,0.554688,0.0888672,-0.0267334,0.202148,0.100098,0.0166016,-0.314453,0.0698242,-0.171875,0.0266113,0.0830078,-0.115723,-0.384766,0.251953,0.000132561,-0.195312,-0.155273,0.289062,-0.300781,0.171875,0.166992,-0.185547,-0.0673828,-0.248047,0.291016,-0.460938,0.0664062,0.0649414,-0.090332,0.0397949,-0.214844,0.170",


In [22]:
# List the columns of the vector store table.
vector_table.columns

['TD_ID', 'TD_FILENAME', 'CHUNKS', 'Embedding', 'Message']

In [23]:
# Fetch the DDL of the vector store table: first column of the first row returned by SHOW TABLE.
vector_store_ddl_rows = tdml.execute_sql(f"SHOW TABLE {database}.{vector_store_table}").fetchall()
vector_store_ddl = vector_store_ddl_rows[0][0]

# Carriage returns are swapped for newlines so the DDL prints on separate lines.
print(vector_store_ddl.replace('\r','\n'))

CREATE MULTISET TABLE dm250067.denis_pdfs_embeddings ,FALLBACK ,
     NO BEFORE JOURNAL,
     NO AFTER JOURNAL,
     CHECKSUM = DEFAULT,
     DEFAULT MERGEBLOCKRATIO,
     MAP = TD_MAP2
     (
      TD_ID INTEGER,
      TD_FILENAME VARCHAR(1024) CHARACTER SET LATIN NOT CASESPECIFIC,
      CHUNKS VARCHAR(32000) CHARACTER SET LATIN NOT CASESPECIFIC,
      Embedding SYSUDTLIB.Vector,
      Message VARCHAR(32000) CHARACTER SET UNICODE NOT CASESPECIFIC)
PRIMARY INDEX ( TD_ID );


### 3.3 - Vector Normalization: TD_VECTORNORMALIZE

In [25]:
# Parameters for TD_Vectornormalize: the vector column to normalize, the id column(s),
# the embedding size, and the extra columns to carry through to the output.
vector_column            = 'Embedding'
idcolumns                = ['TD_ID']
# NOTE(review): presumably the output dimension of the embedding model chosen above —
# confirm if the model is changed.
embedding_dimension      = 1536
accumulate_columns       = ['TD_FILENAME','Message']

In [26]:
# Quoted, comma-separated column lists for the USING clause.
id_columns_list = ','.join("'" + column + "'" for column in idcolumns)
accumulate_columns_list = ','.join("'" + column + "'" for column in accumulate_columns)

# SELECT that applies TD_Vectornormalize (UNITVECTOR approach) to the embedding column
# of the vector store table.
query_normalization = f"""
SELECT *
FROM TD_Vectornormalize(
    ON {database}.{vector_store_table} AS InputTable
    USING
        IDColumns( {id_columns_list})
        TargetColumns('{vector_column}')
        Accumulate({accumulate_columns_list})
        Approach('UNITVECTOR')
        EmbeddingSize({embedding_dimension})    
) as DT
"""

print(query_normalization)


SELECT *
FROM TD_Vectornormalize(
    ON dm250067.denis_pdfs_embeddings AS InputTable
    USING
        IDColumns( 'TD_ID')
        TargetColumns('Embedding')
        Accumulate('TD_FILENAME','Message')
        Approach('UNITVECTOR')
        EmbeddingSize(1536)    
) as DT



In [None]:
# Run the normalization query and display a preview of the result.
tdml.DataFrame.from_query(query_normalization)