# Инициализация

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
import sys

from google.colab import drive
# Mount Google Drive into the Colab VM; everything below reads from and
# writes to the project tree stored on the drive.
drive.mount("/content/drive")

# Root folder of the project on the mounted drive (note the trailing slash).
ROOT = "/content/drive/MyDrive/projects-ds/birds/"

# Put <ROOT>/code on the import path.
# NOTE(review): presumably this is where the project modules imported in the
# next cell (bird_logger, bird_progress) live — confirm.
sys.path.append(os.path.join(ROOT, "code"))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from bird_logger import Logger, LogOperation, LOG
from bird_progress import ProgressBox

# Код загрузки оригинальных файлов и приведения их в нормализованный формат

In [4]:
def just_name(filename):
    """Return the base name of `filename` without directories or final extension.

    Only the last extension is removed, so ``"a/b/c.tar.gz"`` yields ``"c.tar"``.
    """
    stem, _extension = os.path.splitext(os.path.basename(filename))
    return stem

# Columns that format_bird converts to numbers.
NUMERIC_COLUMNS = [
    "event-id",
    "location-long",
    "location-lat",
    "ground-speed",
    "heading",
    "height-above-msl",
]

# Full set of columns read from each source file: the numeric ones plus the timestamp.
COLUMNS = NUMERIC_COLUMNS + ["timestamp"]

def load_bird(filename):
    """Read one source CSV file into a DataFrame restricted to COLUMNS.

    The field separator is guessed from the first line of the file: a
    semicolon if that line contains one, otherwise a comma. All columns are
    requested as object dtype (no numeric conversion happens here), and
    `timestamp` is handed to pandas' date parser.

    The shape of the loaded frame is reported through the global LOG.

    Parameters
    ----------
    filename : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame with the columns listed in COLUMNS.
    """
    # Peek at the header line only; it is enough to tell ";" files from "," files.
    with open(filename, encoding="unicode_escape") as source:
        header_line = source.readline()
    separator = ";" if ";" in header_line else ","

    frame = pd.read_csv(
        filename,
        encoding="unicode_escape",
        usecols=COLUMNS,
        delimiter=separator,
        dtype="O",
        parse_dates=["timestamp"],
    )

    # NOTE(review): the meaning of the literal 0 is defined by
    # Logger.operation_info, which is not visible here — confirm.
    LOG.operation_info(just_name(filename), "load", 0, frame.shape)

    return frame

def format_bird(bird):
    """Normalise a freshly loaded frame in place; returns None.

    * Every column in NUMERIC_COLUMNS has commas replaced by dots and is then
      converted with ``pd.to_numeric``. The columns must support the ``.str``
      accessor, i.e. hold strings.
    * Hyphens in all column names are replaced by underscores.

    Because the hyphenated names listed in NUMERIC_COLUMNS no longer exist
    after the rename, calling this twice on the same frame fails.
    """
    def _parse_decimal(column):
        # Turn decimal commas into dots so to_numeric can parse the values.
        return pd.to_numeric(column.str.replace(",", "."))

    converted = bird[NUMERIC_COLUMNS].apply(_parse_decimal)
    bird[NUMERIC_COLUMNS] = converted

    renamed = bird.columns.str.replace("-", "_")
    bird.columns = renamed

## Цикл обработки файлов исходного датасета

In [6]:
%%time

BIRD_DIR = os.path.join(ROOT, "datasets")
SAVE_DIR = os.path.join(ROOT, "results/01_format")
LOG_DIR = os.path.join(ROOT, "logs")

bird_files = sorted(os.listdir(BIRD_DIR))

disp = ProgressBox(stages={'format':'приводим к нужным форматам'}, total_files=len(bird_files))

!rm -r -f $SAVE_DIR/*.*

LOG = Logger(LOG_DIR, prefix="format_")

for i, file in enumerate(bird_files):
    disp.new_file(file, count=i+1)
    bird = load_bird(os.path.join(BIRD_DIR, file))

    disp.new_stage('format')
    format_bird(bird)
    
    disp.new_stage('save')
    bird.to_parquet(os.path.join(SAVE_DIR, f"{just_name(file)}.parquet"), index=False)

    disp.new_stage('ok')
    
LOG.flush()

CPU times: user 7min 39s, sys: 26.5 s, total: 8min 5s
Wall time: 10min 59s
