## first time setup
install packages and configure NLTK

In [None]:
import sys
# Run pip through the interpreter that backs this notebook (sys.executable)
# and install the pinned dependencies.
# NOTE(review): confirm that pandas 1.3.5 and numpy 1.24 are compatible.
!{sys.executable} -m pip install pandas==1.3.5 numpy==1.24 nltk

## main script

### setup, data loading and filtering

In [None]:
import pandas as pd
# Lift pandas' display limits so printed frames are not truncated.
# None removes the limit; the number in each comment is a finite alternative.
pd.set_option('display.max_columns', None)  # alternative: 1000 columns
pd.set_option('display.max_rows', None)  # alternative: 1000 rows
pd.set_option('display.max_colwidth', None)  # alternative: 199 characters per cell
import os
import numpy as np
import nltk
import json

In [None]:
def load_dataframes(base_dir):
    """Load every pickled dataframe in *base_dir* and concatenate them.

    Files are read in sorted filename order so the row order of the result
    is reproducible (``os.listdir`` alone returns entries in arbitrary
    order). Sub-directories such as ``.ipynb_checkpoints`` are skipped.

    Args:
        base_dir: Directory containing one pickled ``DataFrame`` per file.

    Returns:
        A single ``DataFrame`` holding the rows of all files. The index
        values of the individual files are kept as they are.

    Raises:
        ValueError: If *base_dir* contains no files (raised by ``pd.concat``).
    """
    # NOTE: unpickling can execute arbitrary code; only use trusted files.
    paths = [
        os.path.join(base_dir, name) for name in sorted(os.listdir(base_dir))
    ]
    dfs = [pd.read_pickle(path) for path in paths if os.path.isfile(path)]

    return pd.concat(dfs)

In [None]:
def load_predefined_queries(jsonfile):
    """Read the predefined text query of every task from a JSON file.

    The file must contain ``description.tasks``: a list of task objects,
    each with a ``"name"`` and a list of ``"hints"``. For each task the
    ``"text"`` of its last hint whose ``"type"`` is ``"TextHint"`` is used;
    a task without such a hint maps to the empty string.

    Args:
        jsonfile: Path to the JSON file. It is read as UTF-8.

    Returns:
        Dict mapping task name to its predefined query text.

    Raises:
        KeyError: If an expected key is missing from the JSON structure.
    """
    # The context manager closes the file even if parsing fails.
    with open(jsonfile, "r", encoding="utf-8") as f:
        compdata = json.load(f)

    queries = {}
    for task in compdata["description"]["tasks"]:
        name = task["name"]
        text_hints = [
            hint["text"] for hint in task["hints"] if hint["type"] == "TextHint"
        ]
        # When several text hints exist, the last one wins.
        queries[name] = text_hints[-1] if text_hints else ""

    return queries
    

In [None]:
def reformat_value(x):
    """Strip leading model tags from the ``"value"`` entry of *x*.

    The tags ``"CLIP: "`` and ``"Temporal CLIP: "`` are checked in that
    order, each against the text left over from the previous check.

    Args:
        x: Mapping-like row with a string under ``"value"``; it is
            modified in place.

    Returns:
        The same object *x*, with ``"value"`` updated.
    """
    cleaned = x["value"]

    for tag in ("CLIP: ", "Temporal CLIP: "):
        cleaned = cleaned[len(tag):] if cleaned.startswith(tag) else cleaned

    x["value"] = cleaned
    return x

In [None]:
# Load all pickled log dataframes from the data directory into one frame.
dataset = load_dataframes("/data/vbse2022/data/dataframes/vbse2022_logs_dataframes/vbse2022")
# Cell output: (number of rows, number of columns).
dataset.shape

In [None]:
# Map each task name to its predefined text-hint query from the competition JSON.
predefinedqueries = load_predefined_queries("/data/vbse2022/data/DRES data/VBS Extended Test Nov 11, 2022.json")
print(predefinedqueries)

In [None]:
# Keep only the log rows whose category is "TEXT".
textqueries = dataset.loc[dataset["category"]=="TEXT"]
# Cell output: (number of rows, number of columns) after filtering.
textqueries.shape

In [None]:
# Reformat values: apply reformat_value to every row (axis=1) to strip the
# "CLIP: " / "Temporal CLIP: " prefixes from the "value" column.
textqueries = textqueries.apply(reformat_value, axis=1)

### query change events
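- keep only queries that differ from the previous query of the same team and task (the first query is compared against the predefined task query)
- add `delta_len`: the change in query length, in characters, relative to that previous query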

In [None]:
# Show the first query text as the cell output.
textqueries["value"].iloc[:1]

In [None]:
def _collect_query_changes(queries, initial_queries):
    """Return the rows of *queries* whose text differs from the previous query.

    Rows are processed per task and, within a task, per team, ordered by
    ``elapsed_since_task_start_ms``. The first query of each team is compared
    against the task's entry in *initial_queries*; every later query is
    compared against the row before it. Rows with unchanged text are dropped.

    Args:
        queries: DataFrame with at least the columns ``task``, ``team``,
            ``elapsed_since_task_start_ms`` and ``value`` (query text).
        initial_queries: Dict mapping task name to its predefined query text.

    Returns:
        DataFrame with the columns of *queries* plus ``delta_len``, the
        length of the query minus the length of the query it replaced, in
        characters. The index is a fresh 0..n-1 range.

    Raises:
        KeyError: If a task in *queries* has no entry in *initial_queries*.
    """
    columns = list(queries.columns.values) + ["delta_len"]
    # Rows are gathered in a list and turned into a frame once; growing a
    # DataFrame row by row copies it on every step.
    changed_rows = []

    for task in queries["task"].unique():
        task_queries = queries.loc[queries["task"] == task]
        for team in task_queries["team"].unique():
            team_queries = task_queries.loc[task_queries["team"] == team]
            # A stable sort keeps the logged order of equal timestamps.
            team_queries = team_queries.sort_values(
                by=["elapsed_since_task_start_ms"], kind="mergesort"
            )

            # The first query is compared against the predefined task query.
            prev_query = initial_queries[task]

            for _, row in team_queries.iterrows():
                query = row["value"]
                if query == prev_query:
                    continue

                record = row.to_dict()
                record["delta_len"] = len(query) - len(prev_query)
                changed_rows.append(record)

                prev_query = query

    return pd.DataFrame(changed_rows, columns=columns)


querychanges = _collect_query_changes(textqueries, predefinedqueries)

print(querychanges)