In [1]:
# Импорт библиотек
import os
import zipfile
from pathlib import Path
from datetime import datetime
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
# Path configuration: BASE_DIR is the parent of the current working directory;
# raw and processed data live in sibling folders under BASE_DIR/data.
BASE_DIR = Path.cwd().parent
DATA_RAW = BASE_DIR.joinpath('data', 'raw')
DATA_PROCESSED = BASE_DIR.joinpath('data', 'processed')

In [3]:
# Ensure both data directories exist: missing parents are created and
# already-existing directories are left untouched.
for data_dir in (DATA_RAW, DATA_PROCESSED):
    data_dir.mkdir(parents=True, exist_ok=True)

In [4]:
def download_kaggle_dataset(dataset, path):
    """Download a Kaggle dataset as a zip archive without unpacking it.

    Args:
        dataset: Dataset slug in ``"<owner>/<dataset-name>"`` form.
        path: Directory handed to the Kaggle client as the download
            location. May be a ``str`` or a ``pathlib.Path``.

    Returns:
        pathlib.Path: ``path / "<dataset-name>.zip"``.
        NOTE(review): this assumes the Kaggle client names the archive after
        the dataset part of the slug — confirm against the client version in use.

    Raises:
        ValueError: If ``dataset`` is not exactly one owner and one dataset
            name separated by a single ``/``.
    """
    # Validate the slug first so a typo fails immediately instead of after
    # authentication and a network round-trip.
    owner, separator, dataset_name = dataset.partition('/')
    if not (separator and owner and dataset_name) or '/' in dataset_name:
        raise ValueError(
            f"Expected dataset in '<owner>/<name>' form, got {dataset!r}"
        )

    # Accept plain strings as well as Path objects for the target directory.
    path = Path(path)

    # KaggleApi comes from the notebook's top-level import cell.
    api = KaggleApi()
    api.authenticate()

    # Keep the archive zipped; extraction is a separate step.
    api.dataset_download_files(dataset, path=path, unzip=False)

    return path / f"{dataset_name}.zip"

In [5]:
# Download both dataset archives into their own raw-data subdirectories
world_events_zip = download_kaggle_dataset(
    dataset='saketk511/world-important-events-ancient-to-modern',
    path=DATA_RAW / 'world_events',
)

car_accident_zip = download_kaggle_dataset(
    dataset='nextmillionaire/car-accident-dataset',
    path=DATA_RAW / 'car_accident',
)

Dataset URL: https://www.kaggle.com/datasets/saketk511/world-important-events-ancient-to-modern


Dataset URL: https://www.kaggle.com/datasets/nextmillionaire/car-accident-dataset


In [6]:
def extract_zip(zip_path, extract_to):
    """Unpack every member of a zip archive into a directory.

    Args:
        zip_path: Location of the zip archive to read.
        extract_to: Directory the archive members are extracted into.

    Returns:
        The ``extract_to`` argument, unchanged, for convenient chaining.
    """
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_to)
    return extract_to

# Unpack each downloaded archive into its dataset's raw-data directory
world_events_dir = extract_zip(
    zip_path=world_events_zip,
    extract_to=DATA_RAW / 'world_events',
)
car_accident_dir = extract_zip(
    zip_path=car_accident_zip,
    extract_to=DATA_RAW / 'car_accident',
)

In [7]:
def normalize_data(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Steps, in order:
      1. Column names are stripped, lower-cased, runs of whitespace / ``-`` /
         ``.`` become ``_``, any remaining character outside ``[a-z0-9_]`` is
         dropped, and leading/trailing ``_`` are removed.
      2. Every column whose normalized name contains ``date`` or ``time`` is
         parsed with ``pd.to_datetime`` and rewritten as ``YYYY-MM-DD``
         strings. Columns that cannot be parsed are left as they are.
      3. Cells that are empty or whitespace-only strings are replaced with
         ``None``.
      4. Fully duplicated rows are dropped.

    Args:
        df: Input ``pandas.DataFrame`` with string column labels.

    Returns:
        pandas.DataFrame: A new, normalized frame.
    """
    # Work on a copy: renaming columns and overwriting date columns below
    # would otherwise silently modify the caller's frame as well.
    df = df.copy()

    # Normalize column names
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(r'[\s\-\.]+', '_', regex=True)
        .str.replace(r'[^a-z0-9_]', '', regex=True)
        .str.strip('_')
    )

    # Convert date-like columns to ISO date strings (best effort).
    # NOTE(review): a column matched via 'time' that holds only a time of day
    # would presumably be parsed with a default date and then reduced to that
    # date by the '%Y-%m-%d' format — confirm this is intended for such columns.
    for col in df.columns:
        if 'date' in col or 'time' in col:
            try:
                df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')
            except (ValueError, TypeError, OverflowError, AttributeError):
                # Parsing failed (or the result was not datetime-like, so
                # `.dt` is unavailable): keep the original values. Unlike a
                # bare `except`, this lets KeyboardInterrupt/SystemExit through.
                continue

    # Cleanup: blank strings become missing values, then drop duplicate rows
    df = df.replace(r'^\s*$', None, regex=True)
    df = df.drop_duplicates()

    return df

def _first_csv(directory):
    """Return the alphabetically first ``*.csv`` file directly inside ``directory``.

    Sorting makes the choice deterministic when several CSV files are present,
    and an empty directory yields a clear error instead of a bare StopIteration.

    Raises:
        FileNotFoundError: If ``directory`` contains no ``*.csv`` file.
    """
    csv_files = sorted(directory.glob('*.csv'))
    if not csv_files:
        raise FileNotFoundError(f"No CSV file found in {directory}")
    return csv_files[0]


# Historical world events: load, normalize, and store as Parquet
world_events = pd.read_csv(_first_csv(world_events_dir))
world_events = normalize_data(world_events)
world_events.to_parquet(
    DATA_PROCESSED / 'world_events.parquet',
    engine='pyarrow',
    compression='snappy'
)

# Car accident data: load, normalize, and store as Parquet
car_accident = pd.read_csv(_first_csv(car_accident_dir))
car_accident = normalize_data(car_accident)
car_accident.to_parquet(
    DATA_PROCESSED / 'car_accident.parquet',
    engine='pyarrow',
    compression='snappy'
)

  df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')


  df[col] = pd.to_datetime(df[col]).dt.strftime('%Y-%m-%d')
