In [1]:
# Root directory under which this notebook reads and writes all of its files.
DATA_PATH = "/workspace/aip-for-ba/data"


In [None]:
# count number of files in a directory and its subdirectories
import pathlib


def count_files(directory):
    """Return the number of regular files below ``directory``, recursively.

    Args:
        directory: Path (string or path-like) of the directory to walk.

    Returns:
        Count of entries under ``directory`` for which ``is_file()`` is true;
        directories themselves are not counted.
    """
    root = pathlib.Path(directory)
    return sum(1 for entry in root.rglob('*') if entry.is_file())

# Report how many files are currently stored under data_bratislava.
if __name__ == '__main__':
    # The path must be an f-string: without the prefix the literal text
    # "{DATA_PATH}" is used as a directory name and the count is always 0.
    files_count = count_files(f'{DATA_PATH}/data_bratislava')
    print(f'Number of files: {files_count}')

In [None]:
import json
import requests as r

# Download every dataset listed in the manifest and store the 'attributes' of
# its features as newline-delimited JSON under data_bratislava/.
# Paths need an explicit '/' after DATA_PATH because DATA_PATH has no trailing
# slash; the manifest is read as UTF-8 explicitly.
with open(f"{DATA_PATH}/ingest_data_datasets.json", "r", encoding="utf-8") as f:
    datasets = json.load(f)

for dataset in datasets:
    dataset_url = dataset['table']
    response = r.get(dataset_url)
    if response.status_code != 200:
        print(f'Error downloading data from {dataset_url}')
        continue

    # Parsing is handled separately from writing so that a bad payload and a
    # filesystem failure are not reported with the same message.
    try:
        rows = [row.get('attributes') for row in response.json().get('features')]
    except (ValueError, TypeError, AttributeError):
        # ValueError: body is not valid JSON; TypeError: no 'features' key
        # (None is not iterable); AttributeError: payload or feature is not a dict.
        print(f'No features in {dataset_url}')
        continue

    output_path = f'{DATA_PATH}/data_bratislava/{dataset["title"]}.ndjson'
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            for row in rows:
                # ensure_ascii=False keeps non-ASCII characters readable in the file.
                f.write(json.dumps(row, ensure_ascii=False) + '\n')
    except OSError as exc:
        # Keep going with the remaining datasets, but say what actually failed.
        print(f'Could not write {output_path}: {exc}')
    

In [None]:
import polars as pl
# Load a single dataset file as a sample and show its summary statistics.
# NOTE(review): this reads from data_bratislava_2 while the other cells use
# data_bratislava — confirm that is intended.
data = pl.read_ndjson(
    f'{DATA_PATH}/data_bratislava_2/Emisie a merné územné emisie základných látok vypustených zo str.ndjson'
)
data.describe()

In [None]:
from ollama import Client
# Client for the Ollama server; the request below sets the context size to
# 12k (num_ctx=12000).
client = Client(host='http://192.168.0.199:11434')

# Ask the model to describe the sample dataset from its describe() summary and
# take the generated text from the 'response' field of the reply.
client.generate(
    model='llama3.1',
    options={"num_ctx": 12000},
    prompt=f'Write description of this dataset, what columns it has, what values are there and the ranges: {data.describe()}',
)['response']

# print(response['message']['content'])

In [None]:
import json
# Load the dataset manifest that the description loop iterates over.
# The encoding is given explicitly so that non-ASCII text in the manifest is
# decoded the same way regardless of the platform's default encoding.
with open(f'{DATA_PATH}/ingest_data_datasets.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)

print(json_data)

In [None]:
# Collects manifest entries extended with an AI-generated 'description'.
# NOTE(review): presumably initialised in its own cell so that re-running the
# description loop skips datasets that were already processed — confirm.
updated_datasets = []


In [None]:
from pathlib import Path
# Generate an AI description for every manifest entry whose .ndjson file exists
# and is non-empty, appending the enriched entry to updated_datasets.
# Titles already present in updated_datasets are skipped, so the cell can be
# re-run without describing a dataset twice.
described_titles = {entry['title'] for entry in updated_datasets}  # O(1) membership checks
for dataset in json_data:
    title = dataset['title']
    file_path = Path(f'{DATA_PATH}/data_bratislava/{title}.ndjson')
    if not file_path.is_file() or file_path.stat().st_size == 0:
        # Covers both a missing file and a zero-byte file.
        print(f'{title} not found or empty')
        continue
    if title in described_titles:
        continue
    data = pl.read_ndjson(file_path)
    response = client.generate(
        model='llama3.1',
        options={"num_ctx": 12000},
        prompt=f'Write description of this dataset, what columns it has, what values are there and the ranges: {data.describe()}',
    )['response']
    updated_datasets.append({**dataset, 'description': response})
    described_titles.add(title)


In [58]:
# Persist the enriched manifest entries as indented JSON.
with open(f'{DATA_PATH}/ingest_data_datasets_ai_description.json', 'w') as out_file:
    json.dump(obj=updated_datasets, fp=out_file, indent=4)

In [12]:
from pathlib import Path


# Map every regular file below mib.sk (recursively) to its size in MiB
# (bytes divided by 1024 twice).
file_sizes = {
    entry: entry.stat().st_size / 1024 / 1024
    for entry in Path(f'{DATA_PATH}/mib.sk').rglob('*')
    if entry.is_file()
}


In [13]:
# Total size of all files collected in file_sizes; the values are in MiB.
print(sum(file_sizes.values()))

47.91040897369385
