# IsBit Research and Testing Notebook

## Initial Plan

The initial plan is to split the data set of 5451 question texts into three data sets, whose formats are described below. The first one, containing 80% of the parent data, is for local training of different clustering models. The remaining 20% is further divided into two unseen test sets of 10% each: the second data set is for local evaluation of models and therefore contains the coarse label, and the third is for remote testing and is exported to the application.

`ML/data/QAQC/swe_local_train.csv`: 

This is the first data set mentioned above, where each data row is formatted as: `Hur utvecklades träldomen i Ryssland?`. It holds the questions to train the clustering models on.

`ML/data/QAQC/swe_qaqc_local_test.csv`: 

This is the second data set, formatted as `Hur utvecklades träldomen i Ryssland?, DESC`: the question together with its true label, for local evaluation.

`ML/data/QAQC/swe_qaqc_remote_test.csv` :

This is the third data set, formatted like the second one and meant to be used for user tests exported to the application.

---

## Preprocessing

The cell below will create the three data sets from the parent data set, but keep in mind that every run of this cell will overwrite the data files. It is meant to be run once per iteration of testing/building models. If one wants to do further preprocessing, I suggest making a new function, saving the data to subfolders of the `output_dir`, and labelling the files with some iteration tag.

The cell below filters the source data and creates a new, formatted CSV file containing the complete data set, where the following work is done:

- Replacing commas inside the text field with pipe signs `|`.
- Removing the outer quotation marks of the text field.
- Inserting escape characters where needed, mostly in text fields containing quotes.

Examples of some normal and edge cases in the data can be found under `ML/data_exampels`.

In [22]:
import pandas as pd
import csv
import re
import os

# Raw source data. The file is ignored by git, so it has to be downloaded separately.
source_path = "data/QAQC_v1/swe_qaqc_train.csv"
# Preprocessed output. Kept in the same folder as the source so the data is not pushed to git.
output_path = "data/QAQC_v1/swe_qaqc_prep_train.csv"

# helper to remove outer quotation marks
def strip_outer_quotationmarks(q):
    """Return *q* without one enclosing pair of double quotes, whitespace-trimmed.

    The quotes are removed only when *q* both begins and ends with ``"``.
    Leading and trailing whitespace is trimmed afterwards in either case.
    """
    wrapped = q[:1] == '"' and q[-1:] == '"'
    inner = q[1:-1] if wrapped else q
    return inner.strip()

# helper to remove commas in the question text, causes problems since the cols are set by commas in csv file
def replace_commas(line):
    """Return *line* with commas inside double-quoted spans turned into ``|``.

    Only text between a pair of double quotes is touched; commas outside
    quotes (the CSV column separators) are left as they are.
    """
    def _pipe_commas(quoted):
        # Rebuild the quoted span around its comma-free content.
        inner = quoted.group(1)
        return '"{}"'.format(inner.replace(',', '|'))

    return re.sub(r'"([^"]*?)"', _pipe_commas, line)

def process_csv(source_data_path, output_data_path, temp_file_path="temp_file.csv"):
    """Clean the raw question CSV and write a two-column (text, coarse label) CSV.

    Every line of the source file is passed through ``replace_commas`` so that
    commas inside double-quoted spans become ``|``. The result is written to a
    temporary file and parsed by pandas with quoting disabled, so the quote
    characters stay in the values; ``strip_outer_quotationmarks`` then removes
    one enclosing pair from each question. Each ``verbose label`` value is cut
    at the first ``:`` and only the leading part is kept.

    Args:
        source_data_path: CSV to read; must provide the columns ``text`` and
            ``verbose label``.
        output_data_path: Destination CSV, overwritten if it exists. It starts
            with the header row ``text,coarse label``.
        temp_file_path: Scratch file for the intermediate data. It is deleted
            again whether or not parsing succeeds.

    Returns:
        A list of ``(question, coarse_label)`` tuples in file order.
    """
    with open(source_data_path, 'r', encoding='utf-8') as infile:
        modified_lines = [replace_commas(line.strip()) for line in infile]

    try:
        with open(temp_file_path, 'w', encoding='utf-8', newline='') as outfile:
            outfile.write('\n'.join(modified_lines))

        # QUOTE_NONE: quote characters are kept as data, which is why the
        # commas inside quoted spans had to be replaced beforehand.
        temp_data = pd.read_csv(temp_file_path, quoting=csv.QUOTE_NONE)
    finally:
        # Remove the scratch file even when writing or parsing fails; before
        # this, an exception left it behind on disk.
        try:
            os.remove(temp_file_path)
        except FileNotFoundError:
            pass

    no_comma_questions = [
        strip_outer_quotationmarks(q) for q in temp_data["text"].tolist()
    ]
    # only keep the coarse label (the part before the first ':')
    coarse_labels = [
        label.split(":")[0] for label in temp_data["verbose label"].tolist()
    ]
    zipped = list(zip(no_comma_questions, coarse_labels))

    with open(output_data_path, 'w', encoding='utf-8', newline='') as outfile:
        writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["text", "coarse label"])
        writer.writerows(zipped)

    #print(f"Processed data saved to {output_data_path}") # trace print
    return zipped

# Run the preprocessing: after this call the data is formatted and ready for splitting.
# As the last expression of the cell, the returned list of (question, coarse label)
# tuples is echoed as the cell output.
process_csv(source_path, output_path)

[('Hur utvecklades träldomen i Ryssland?', 'DESC'),
 ('Vilka filmer inkluderade karaktären Popeye Doyle?', 'ENTY'),
 ('Hur kan jag hitta en lista över kändisars riktiga namn?', 'DESC'),
 ('Vilken fjäderfä fångar rampljuset efter det kinesiska apans år?', 'ENTY'),
 ('Vad är den fullständiga formen av .com?', 'ABBR'),
 ('Vilken föraktlig skurk stal korken från min lunch?', 'HUM'),
 ('Vilket lag blev basebollens St. Louis Browns?', 'HUM'),
 ('Vad är det äldsta yrket?', 'HUM'),
 ('Vad är leverenzymer?', 'DESC'),
 ('Namnge den ärriga prisjägaren i The Old West.', 'HUM'),
 ('När föddes Ozzy Osbourne?', 'NUM'),
 ('Varför faller tyngre föremål snabbare nedåt?', 'DESC'),
 ('Vem var Yankees stolthet?', 'HUM'),
 ('Vem dödade Gandhi?', 'HUM'),
 ('Vad anses vara den dyraste katastrofen som försäkringsbranschen någonsin har stött på?',
  'ENTY'),
 ('Vilken utbredd amerikansk stat har flest flygplatser?', 'LOC'),
 ('Vad behandlade den enda upphävda ändringen av USA:s konstitution?', 'DESC'),
 ('Hur m

In [None]:
# working preprocessing

import pandas as pd
import csv
import re

# Input file read by this cell.
test_path = "data/simple_tests/j_tests.csv"
# The cleaned (text, coarse label) rows are written here.
output_test_path = "data/simple_tests/j_tests_out.csv"
# Intermediate file holding the lines after comma replacement; this cell does not delete it.
temp_file_path = "data/simple_tests/temp_file.csv"

def strip_outer_quotationmarks(q):
    """Trim whitespace from *q*, first dropping one enclosing pair of double quotes.

    A value that does not both start and end with ``"`` is only trimmed.
    """
    if not (q.startswith('"') and q.endswith('"')):
        return q.strip()
    return q[1:-1].strip()

def replace_commas(line):
    pattern = r'"([^"]*?)"'
    def replace_commas(match):
        return '"' + match.group(1).replace(',', '|') + '"'
    
    return re.sub(pattern, replace_commas, line)

# Step 1: replace the commas inside quoted spans on every line of the test file.
with open(test_path, 'r', encoding='utf-8') as infile:
    modified_lines = [replace_commas(line.strip()) for line in infile]

# Step 2: store the result so pandas can parse it as a regular CSV file.
with open(temp_file_path, 'w', encoding='utf-8', newline='') as outfile:
    outfile.write('\n'.join(modified_lines))

# Step 3: parse with quoting disabled, then remove the outer quotes by hand.
temp_data = pd.read_csv(temp_file_path, quoting=csv.QUOTE_NONE)
questions = temp_data["text"].tolist()
no_comma_questions = list(map(strip_outer_quotationmarks, questions))

# Step 4: keep only the part of each label that precedes the first ':'.
coarse_labels = [
    label.split(":")[0] for label in temp_data["verbose label"].tolist()
]
zipped = list(zip(no_comma_questions, coarse_labels))

# Step 5: write the header and the (text, coarse label) rows.
with open(output_test_path, 'w', encoding='utf-8', newline='') as outfile:
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["text", "coarse label"])
    writer.writerows(zipped)

print(zipped)




In [None]:
## ----- gussy help 

import pandas as pd
import csv
import re

# Input sample and destination for the cleaned (question, label) rows.
data_path = "data/simple_tests/j_tests.csv"
output_test_path = "data/simple_tests/j_tests_out.csv"

# Quoting is disabled on read, so quote characters stay part of the parsed values.
data = pd.read_csv(data_path, quoting=csv.QUOTE_NONE)

# Unwrap values that are fully enclosed in double or single quotes, then trim whitespace.
questions = [
    re.sub(r'^(?:"(.*)"|\'(.*)\')$', r'\1\2', q).strip()
    for q in data["text"].to_list()
]
print(questions)

# Iterate the list directly instead of driving a manual index with a while loop.
print("qeustions line by line")
for question in questions:
    print(question)

# Keep only the part of each label that precedes the first ':'.
entity = [label.split(":")[0] for label in data["verbose label"].to_list()]
zipped = list(zip(questions, entity))

with open(output_test_path, mode='w', newline='', encoding='utf-8') as file:
    # NOTE(review): with quotechar=None and no explicit `quoting`, the csv module
    # appears to fall back to QUOTE_NONE, so a field containing the delimiter
    # would raise instead of being quoted. Confirm against the csv documentation.
    writer = csv.writer(file, quotechar=None)
    writer.writerows(zipped)

**PREP TESTING**

In [45]:
# ---------------------- kinda working ----------------------------

# import pandas as pd
# from sklearn.model_selection import train_test_split
# import os

# def preprocess_and_split_data(input_file_path, output_dir):
#     data = pd.read_csv(input_file_path)

#     train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)
#     local_test_data, remote_test_data = train_test_split(test_data, test_size=0.5, random_state=42, shuffle=True)

#     train_data_for_clustering = train_data[['text']]

#     local_test_data_for_eval = local_test_data[['text', 'verbose label']]

#     remote_test_data_for_eval = remote_test_data[['text', 'verbose label']]

#     os.makedirs(output_dir, exist_ok=True)
    
#     train_data_file = os.path.join(output_dir, "swe_qaqc_preproc_train.csv")
#     local_test_data_file = os.path.join(output_dir, "swe_qaqc_local_test.csv")
#     remote_test_data_file = os.path.join(output_dir, "swe_qaqc_remote_test.csv")

#     train_data_for_clustering.to_csv(train_data_file, index=False, header=True)
#     local_test_data_for_eval.to_csv(local_test_data_file, index=False, header=True)
#     remote_test_data_for_eval.to_csv(remote_test_data_file, index=False, header=True)

#     print(f"Data preprocessing complete. Files saved in: {output_dir}")


# input_file_path = "data/QAQC/swe_qaqc_train.csv"  
# output_dir = "data/QAQC"
# preprocess_and_split_data(input_file_path, output_dir)

# ----------------------- bit scuffed ----------------------------

# import pandas as pd
# from sklearn.model_selection import train_test_split
# import os

# def clean_text(text):
#     # Remove leading and trailing quotation marks and embedded quotes
#     text = text.strip().replace('"', '').replace("'", '')
#     return text

# def preprocess_and_split_data(input_file_path, output_dir):    
#     # Load the dataset
#     data = pd.read_csv(input_file_path)

#     # First split: 80% training, 20% testing with stratification
#     train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True, stratify=data['verbose label'])

#     # Second split: Further divide test data into 50% local and 50% remote
#     local_test_data, remote_test_data = train_test_split(test_data, test_size=0.5, random_state=42, shuffle=True, stratify=test_data['verbose label'])

#     # Format for each dataset
#     # 1. Preprocessing training data for clustering
#     train_data_for_clustering = train_data['text'].apply(clean_text).str.split('?').str[0] + '?'

#     # 2. Preprocessing local test data (keeping labels in CAPS)
#     local_test_data_for_eval = local_test_data[['text', 'verbose label']]
#     local_test_data_for_eval['verbose label'] = local_test_data_for_eval['verbose label'].str.split(':').str[0].str.upper()  # Keep only the first part of the verbose label and make it uppercase
#     local_test_data_for_eval['text'] = local_test_data_for_eval['text'].apply(clean_text)
#     local_test_data_for_eval = local_test_data_for_eval.apply(lambda x: f"{x['text']},{x['verbose label']}", axis=1)

#     # 3. Preprocessing remote test data
#     remote_test_data_for_eval = remote_test_data[['text', 'verbose label']]
#     remote_test_data_for_eval['verbose label'] = remote_test_data_for_eval['verbose label'].str.split(':').str[0].str.upper()  # Keep only the first part of the verbose label and make it uppercase
#     remote_test_data_for_eval['text'] = remote_test_data_for_eval['text'].apply(clean_text)
#     remote_test_data_for_eval = remote_test_data_for_eval.apply(lambda x: f"{x['text']},{x['verbose label']}", axis=1)

#     # Save the datasets into respective CSV files
#     os.makedirs(output_dir, exist_ok=True)
    
#     train_data_file = os.path.join(output_dir, "swe_qaqc_prep_train.csv")
#     local_test_data_file = os.path.join(output_dir, "swe_qaqc_prep_local_test.csv")
#     remote_test_data_file = os.path.join(output_dir, "swe_qaqc_prep_remote_test.csv")

#     # Save the datasets without headers and indexes
#     train_data_for_clustering.to_csv(train_data_file, index=False, header=False)
#     local_test_data_for_eval.to_csv(local_test_data_file, index=False, header=False)
#     remote_test_data_for_eval.to_csv(remote_test_data_file, index=False, header=False)

#     print(f"Data preprocessing complete. Files saved in: {output_dir}")

# # Example usage
# input_file_path = "data/QAQC/swe_qaqc_train.csv"  # the parent dataset path
# output_dir = "ML/data/QAQC"  

# preprocess_and_split_data(input_file_path, output_dir)


# # best one I have so far, but it does not work

# import pandas as pd
# from sklearn.model_selection import train_test_split
# import os

# def preprocess_and_split_data(input_file_path, output_dir):    

#     # Read the data, handling potential issues with quotes
#     data = pd.read_csv(input_file_path, quotechar='"', skipinitialspace=True)

#     # Clean the 'text' column to remove only the outer quotes
#     data['text'] = data['text'].str.strip('"')  # Removes outer quotation marks only

#     # First split: 80% training, 20% testing with stratification based on the 'verbose label'
#     train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True, stratify=data['verbose label'])

#     # Check class distribution in the test data
#     class_counts = test_data['verbose label'].value_counts()
#     print("Class distribution in test data:\n", class_counts)

#     # Ensure that no class has less than 2 samples
#     if any(class_counts < 2):
#         print("Warning: Some classes have less than 2 samples in the test data. Adjusting strategy.")
#         # You may want to modify your approach here
#         # For instance, you could skip the stratification for the second split
#         local_test_data, remote_test_data = train_test_split(test_data, test_size=0.5, random_state=42)
#     else:
#         # Second split: Further divide test data into 50% local and 50% remote, again stratifying by 'verbose label'
#         local_test_data, remote_test_data = train_test_split(test_data, test_size=0.5, random_state=42, shuffle=True, stratify=test_data['verbose label'])

#     # Prepare datasets
#     train_data_for_clustering = train_data[['text']]
    
#     # Format local test data for evaluation
#     local_test_data_for_eval = local_test_data[['text', 'verbose label']].copy()
#     local_test_data_for_eval['verbose label'] = local_test_data_for_eval['verbose label'].str.split(':').str[0]  # Keep only the first part
#     #local_test_data_for_eval['text'] = local_test_data_for_eval['text'].str.strip('"')  # Strip quotes

#     remote_test_data_for_eval = remote_test_data[['text', 'verbose label']].copy()
#     remote_test_data_for_eval['verbose label'] = remote_test_data_for_eval['verbose label'].str.split(':').str[0]  # Keep only the first part
#     #remote_test_data_for_eval['text'] = remote_test_data_for_eval['text'].str.strip('"')  # Strip quotes

#     # Create output directory if it doesn't exist
#     os.makedirs(output_dir, exist_ok=True)

#     # Save the datasets to CSV files
#     train_data_file = os.path.join(output_dir, "swe_qaqc_prep_train.csv")
#     local_test_data_file = os.path.join(output_dir, "swe_qaqc_prep_local_test.csv")
#     remote_test_data_file = os.path.join(output_dir, "swe_qaqc_prep_remote_test.csv")

#     train_data_for_clustering.to_csv(train_data_file, index=False, header=True)
#     local_test_data_for_eval.to_csv(local_test_data_file, index=False, header=False)  # No header
#     remote_test_data_for_eval.to_csv(remote_test_data_file, index=False, header=False)  # No header

#     print(f"Data preprocessing complete. Files saved in: {output_dir}")


# # Example usage
# input_file_path = "data/QAQC/swe_qaqc_train.csv"  
# output_dir = "data/QAQC"  
# preprocess_and_split_data(input_file_path, output_dir)


# # -------------- gussy and my gpt usages ----------------

# import pandas as pd
# from sklearn.model_selection import train_test_split
# import os
# import re
# import csv

# def clean_text_and_extract_entity(data):
#     questions = data["text"].to_list()

#     questions = [re.sub(r'^(?:"(.*)"|\'(.*)\')$', r'\1\2', q).strip() for q in questions]
    
#     questions = [q.replace(',', '') for q in questions] 
    
#     # Extract the part before the colon in 'verbose label' and convert to uppercase
#     entity = list(map(lambda x: x.split(":")[0].upper(), data["verbose label"].to_list()))
    
#     # Zip the cleaned questions and entity labels together
#     zipped = list(zip(questions, entity))
    
#     return zipped

# def preprocess_and_split_data(input_file_path, output_dir):    

#     # Read the data
#     data = pd.read_csv(input_file_path, quoting=csv.QUOTE_NONE)

#     # First split: 80% training, 20% testing with stratification based on the 'verbose label'
#     train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, shuffle=True, stratify=data['verbose label'])

#     # Second split: Further divide test data into 50% local and 50% remote, again stratifying by 'verbose label'
#     local_test_data, remote_test_data = train_test_split(test_data, test_size=0.5, random_state=42, shuffle=True, stratify=test_data['verbose label'])

#     # Apply the cleaning function to the training data for clustering (only keep questions)
#     train_data_for_clustering = train_data[['text']].copy()
#     train_data_for_clustering['text'] = train_data_for_clustering['text'].apply(lambda x: re.sub(r'^(?:"(.*)"|\'(.*)\')$', r'\1\2', x).strip())

#     # Apply the cleaning and label extraction to the local and remote test data
#     local_test_data_cleaned = clean_text_and_extract_entity(local_test_data)
#     remote_test_data_cleaned = clean_text_and_extract_entity(remote_test_data)

#     # Create output directory if it doesn't exist
#     os.makedirs(output_dir, exist_ok=True)

#     # Save the datasets to CSV files
#     train_data_file = os.path.join(output_dir, "swe_qaqc_prep_train.csv")
#     local_test_data_file = os.path.join(output_dir, "swe_qaqc_prep_local_test.csv")
#     remote_test_data_file = os.path.join(output_dir, "swe_qaqc_prep_remote_test.csv")

#     # Save the cleaned train data (for clustering) with just the questions
#     train_data_for_clustering.to_csv(train_data_file, index=False, header=True)

#     # Save the cleaned local and remote test data without headers
#     with open(local_test_data_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerows(local_test_data_cleaned)

#     with open(remote_test_data_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerows(remote_test_data_cleaned)

#     print(f"Data preprocessing complete. Files saved in: {output_dir}")

# # Example usage
# input_file_path = "data/QAQC/swe_qaqc_train.csv"  
# output_dir = "data/QAQC"  
# preprocess_and_split_data(input_file_path, output_dir)


#-------------- please work above ------------------------

# ------ KINDA WORKS CONTINUE ON THIS ONE YA DOG -------------


