# 01 - Data Preprocessing

This notebook handles:
- Mounting Google Drive (if applicable)
- Extracting and loading the dataset
- Listing and organizing the files
- Preliminary checks on structure and formats

In [None]:
# Mount Google Drive when running on Colab; skip the mount everywhere else.
try:
    from google.colab import drive
except ImportError:
    # google.colab is only importable inside a Colab runtime. Outside Colab we
    # skip the mount instead of failing, so the cell does not need to be
    # commented out by hand before a local "Run All".
    drive = None
    print('google.colab is not available; skipping Google Drive mount.')
else:
    # force_remount=True re-mounts even if the mount point is already in use.
    drive.mount('/content/drive', force_remount=True)

In [None]:
# Unpack the dataset archive stored on Drive into a local working directory
import zipfile

# Source archive and destination folder.
zip_path = '/content/drive/My Drive/HACKATON HERA/wetransfer_inrete-ai-data-hackathon_2024-11-26_0803.zip'
extract_path = '/content/InRete_Data'

# The context manager closes the archive handle once extraction has finished.
with zipfile.ZipFile(file=zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

In [None]:
# List all files in the dataset
import os

def list_all_files(folder_path):
    """Recursively collect the paths of all files under ``folder_path``.

    Args:
        folder_path: Directory to walk (anything ``os.walk`` accepts).

    Returns:
        A sorted list of file paths, each built by joining the directory
        being visited with the file name. The list is empty when the folder
        contains no files or does not exist (``os.walk`` yields nothing for
        a missing directory).
    """
    file_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(folder_path)
        for name in names
    ]
    # os.walk returns entries in arbitrary, filesystem-dependent order.
    # Sorting makes the listing identical from run to run, so anything built
    # from it in list order is reproducible as well.
    return sorted(file_paths)

# Walk the extracted dataset folder and report every file that was found.
folder_path = extract_path
all_files = list_all_files(folder_path)

print('Found files:')
if all_files:
    # One path per line; the guard avoids printing a blank line for an empty listing.
    print('\n'.join(all_files))

In [None]:
# Select the relevant files by substring match on the full path
def _paths_containing(token):
    """Return the entries of ``all_files`` whose path contains ``token``."""
    return [path for path in all_files if token in path]


tratte_disp = _paths_containing('tratte_disp')
tratte_gas = _paths_containing('tratte_gas')
# NOTE(review): 'part' is matched against the whole path, directory names
# included, so it may select more files than intended - confirm against the
# printed listing below.
rischio = _paths_containing('part')

# Print each group under its heading, one path per line.
for heading, group in (
    ('\nTratte Dispersione:', tratte_disp),
    ('\nTratte Gas:', tratte_gas),
    ('\nRischio:', rischio),
):
    print(heading)
    for path in group:
        print(path)

In [None]:
# Load data into memory
import pandas as pd
import geopandas as gpd

def _stack_parquet(paths):
    """Read every parquet file in ``paths`` with pandas and stack them into one frame.

    The result gets a fresh 0..n-1 index (``ignore_index=True``).
    """
    return pd.concat([pd.read_parquet(path) for path in paths], ignore_index=True)


tratte_disp_df = _stack_parquet(tratte_disp)
# NOTE(review): unlike the other two, this stays a list with one GeoDataFrame
# per file rather than a single concatenated frame - confirm that downstream
# code expects a list.
tratte_gas_df = list(map(gpd.read_parquet, tratte_gas))
rischio_df = _stack_parquet(rischio)