# Create knowledge based on all tables


In [1]:
import json
import requests


Get all tables:
https://data.qa.ssb.no/pxapi2-beta/api/v2/tables/08456?lang=no


In [2]:


def hent_data(url, timeout=60):
    """Fetch a URL with an HTTP GET request and return its decoded JSON body.

    Args:
        url: Address of the API endpoint to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely when the server
            does not answer.

    Returns:
        The parsed JSON payload when the server answers with status 200;
        otherwise a string of the form
        ``"Feil i forespørsel: Statuskode <code>"``. Callers should check the
        type of the result before indexing into it.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    respons = requests.get(url, timeout=timeout)

    # Any status other than 200 is reported as an error string, not raised.
    if respons.status_code != 200:
        return f"Feil i forespørsel: Statuskode {respons.status_code}"

    return respons.json()

# URL of the API endpoint
url = "https://data.qa.ssb.no/pxapi2-beta/api/v2/tables?pageSize=100000"

# Call the function and keep the response in `result` (nothing is printed here)
result = hent_data(url)



In [3]:
# Extract the table entries from the API response.
# hent_data returns an error string instead of parsed JSON when the request
# fails, so stop here with a clear message rather than an opaque TypeError
# from indexing a string.
if not isinstance(result, dict):
    raise RuntimeError(f"Could not fetch the table list: {result}")
data = result['tables']

In [4]:
# 'data' holds the table entries taken from the API response.
# Keep only the two fields that are used to build the vocabulary.
transformed_data = [
    {"variableNames": entry["variableNames"], "label": entry["label"]}
    for entry in data
]

In [5]:
file_name = 'vocabulary.csv'

with open(file_name, 'w', encoding='utf-8') as out:
    for entry in transformed_data:
        # The text after the first colon is the descriptive title; a label
        # without a colon contributes no title words.
        title = entry['label'].partition(':')[2].strip().lower()

        # Break the title into lowercase words, separated by ', '.
        title_words = ', '.join(title.split())

        # Each variable name is lowercased and broken into words the same way.
        name_words = ', '.join(
            ', '.join(name.lower().split()) for name in entry['variableNames']
        )

        # One line per table: title words first, then variable-name words.
        out.write(f"{title_words}, {name_words}\n")


In [6]:
file_name = 'vocabulary.csv'

# Load every line of the vocabulary file.
with open(file_name, 'r', encoding='utf-8') as source:
    lines = source.readlines()

# Values already emitted on this or any earlier line.
seen_values = set()

# Rebuild each line, keeping only values that have not appeared before
# anywhere in the file (first occurrence wins, original order is kept).
processed_lines = []
for raw_line in lines:
    fresh = []
    for token in (part.strip() for part in raw_line.strip().split(',')):
        if token in seen_values:
            continue
        seen_values.add(token)
        fresh.append(token)
    processed_lines.append(', '.join(fresh) + '\n')

# Overwrite the file with the de-duplicated lines.
with open(file_name, 'w', encoding='utf-8') as target:
    target.write(''.join(processed_lines))


In [7]:
file_name = 'vocabulary.csv'  # Replace with your actual file name

# Read the file content
with open(file_name, 'r', encoding='utf-8') as file:
    content = file.read()

# Collapse the file into a single comma-separated line without spaces.
# Line breaks are turned into commas instead of being deleted; deleting them
# would fuse the last value of one line with the first value of the next
# ("a, b\nc, d" would become "a,bc,d"). Empty values, which come from blank
# lines and the trailing newline, are dropped.
tokens = content.replace(' ', '').replace('\n', ',').split(',')
content = ','.join(token for token in tokens if token)

# Write the processed content back to the file
with open(file_name, 'w', encoding='utf-8') as file:
    file.write(content)


In [8]:
from docx import Document
import csv
import os

def csv_to_docx(csv_file_path, docx_file_path):
    """Write every row of a CSV file as a paragraph in a new .docx document.

    Args:
        csv_file_path: Path of the CSV file to read; it is decoded as UTF-8.
        docx_file_path: Path of the .docx file to create. Its parent folder
            is created when it does not exist yet.
    """
    # A bare file name has no folder part, and os.makedirs('') would raise,
    # so only create the folder when there is one.
    docx_folder = os.path.dirname(docx_file_path)
    if docx_folder:
        os.makedirs(docx_folder, exist_ok=True)

    # Start from an empty document.
    doc = Document()

    # Decode as UTF-8 explicitly: the platform default encoding may differ
    # and would garble non-ASCII characters in the CSV.
    with open(csv_file_path, newline='', encoding='utf-8') as file:
        for row in csv.reader(file):
            # Each CSV row becomes one paragraph, values joined by ', '.
            doc.add_paragraph(', '.join(row))

    # Save the document as a .docx file.
    doc.save(docx_file_path)

# Example usage of the function
csv_file_path = 'vocabulary.csv'  # Set the correct path to your CSV file
docx_file_path = '../../Knowledge/tablesVocabulary.docx'  # Name of the generated docx file

csv_to_docx(csv_file_path, docx_file_path)


In [10]:
import csv
import os

def csv_to_txt(csv_file_path, txt_file_path):
    """Copy a CSV file to a UTF-8 text file, one output line per CSV row.

    Args:
        csv_file_path: Path of the CSV file to read; it is decoded as UTF-8.
        txt_file_path: Path of the text file to create. Its parent folder is
            created when it does not exist yet.
    """
    # A bare file name has no folder part, and os.makedirs('') would raise,
    # so only create the folder when there is one.
    txt_folder = os.path.dirname(txt_file_path)
    if txt_folder:
        os.makedirs(txt_folder, exist_ok=True)

    # Read the CSV and write the text file, both as UTF-8.
    with open(csv_file_path, newline='', encoding='utf-8') as file, \
            open(txt_file_path, 'w', encoding='utf-8') as txt_file:
        for row in csv.reader(file):
            # Each CSV row becomes one line, values joined by ', '.
            txt_file.write(', '.join(row) + '\n')

# Example usage of the function
csv_file_path = 'vocabulary.csv'  # Path of the CSV file to convert
txt_file_path = '../../Knowledge/tablesVocabulary.txt'  # Path of the generated txt file

csv_to_txt(csv_file_path, txt_file_path)
