In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [188]:
import os
import logging

# Route log records at INFO level and above to a file in the working
# directory; each record carries a timestamp and its level name.
logging.basicConfig(
    filename='data_loading_errors.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Root directory holding one sub-folder per dataset (edit to match your layout).
base_dir = "processed"

# Per-run bookkeeping: loaded frames keyed by folder name, and the names of
# the folders that could not be loaded.
all_data = dict()
error_folders = list()

# Function to read data files with error handling
def read_data_file(file_path):
    """Load a delimited text file into a DataFrame, or return None.

    Parameters
    ----------
    file_path : str
        Path of the candidate file. A ``.csv`` suffix is parsed as
        comma-separated, a ``.tsv`` suffix as tab-separated.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table. ``None`` when the suffix is neither ``.csv`` nor
        ``.tsv``, when the file does not exist, or when reading fails.

    Notes
    -----
    Failures while reading are logged at ERROR level instead of being raised.
    A file that is simply absent is not logged: callers probe several
    candidate names in turn, so a missing candidate is expected and must not
    fill the error log.
    """
    try:
        if file_path.endswith('.csv'):
            separator = ','
        elif file_path.endswith('.tsv'):
            separator = '\t'
        else:
            return None

        # An absent candidate is a normal outcome, not an error.
        if not os.path.isfile(file_path):
            return None

        return pd.read_csv(file_path, sep=separator)
    except Exception as e:
        logging.error(f"Error reading {file_path}: {str(e)}")
        return None

# Loop through all folders in the base directory
# For every dataset folder under base_dir: load "<folder>/<folder>.csv" (or
# the .tsv of the same name), keep the frame in all_data, write it next to the
# source as "<folder>.parquet", and only then delete the CSV/TSV source files.
for folder_name in os.listdir(base_dir):
    try:
        folder_path = os.path.join(base_dir, folder_name)

        # Skip .ipynb_checkpoints directory
        if folder_name == '.ipynb_checkpoints':
            continue

        # Entries that are not directories are ignored.
        if not os.path.isdir(folder_path):
            continue

        csv_path = os.path.join(folder_path, f"{folder_name}.csv")
        tsv_path = os.path.join(folder_path, f"{folder_name}.tsv")
        new_path = os.path.join(folder_path, f"{folder_name}.parquet")

        # Prefer the CSV; fall back to the TSV when the CSV is absent or
        # could not be read.
        df = read_data_file(csv_path)
        if df is None:
            df = read_data_file(tsv_path)

        if df is None:
            print(f"No data file found for {folder_name}")
            error_folders.append(folder_name)
            continue

        all_data[folder_name] = df
        print(f"Successfully loaded data from {folder_name}")

        # Write the parquet BEFORE removing any source file: if the write
        # raises, the except branch below records the folder and the original
        # CSV/TSV is still on disk.
        df.to_parquet(new_path, index = False)
        print(f"Successfully saved data as {folder_name}.parquet")

        # The parquet copy now exists, so the text sources can be removed.
        for source_path in (csv_path, tsv_path):
            if os.path.exists(source_path):
                os.remove(source_path)
                print("File deleted.")
            else:
                print("File not found.")
    except Exception as e:
        # Log the error and continue with the next folder
        error_message = f"Error processing folder {folder_name}: {str(e)}"
        print(error_message)
        logging.error(error_message)
        error_folders.append(folder_name)

# Summary of the run: number of folders loaded into all_data, number of
# folders recorded in error_folders, then each failed folder on its own line.
print(f"Loaded data from {len(all_data)} folders")
print(f"Encountered errors in {len(error_folders)} folders")
if error_folders:
    print(*error_folders, sep="\n")

print("==" * 20)

Successfully loaded data from sulfur_2
File deleted.
File not found.
Successfully saved data as sulfur_2.parquet
Successfully loaded data from 537_houses
File not found.
File deleted.
Successfully saved data as 537_houses.parquet
Successfully loaded data from keggdirected
File deleted.
File not found.
Successfully saved data as keggdirected.parquet
Successfully loaded data from bike
File deleted.
File not found.
Successfully saved data as bike.parquet
Successfully loaded data from protein
File deleted.
File not found.
Successfully saved data as protein.parquet
Successfully loaded data from 218_house_8L
File not found.
File deleted.
Successfully saved data as 218_house_8L.parquet
Successfully loaded data from 574_house_16H
File not found.
File deleted.
Successfully saved data as 574_house_16H.parquet
Successfully loaded data from kin40k
File deleted.
File not found.
Successfully saved data as kin40k.parquet
Successfully loaded data from sulfur_1
File deleted.
File not found.
Successfull

In [155]:
# For each loaded dataset, show its name followed by the total number of
# missing cells in the whole frame, then a separator line.
for dataset_name, frame in all_data.items():
    missing_per_column = frame.isna().sum()
    print(dataset_name)
    display(missing_per_column.sum())
    print("==" * 30)

MiamiHousing2016


0

nyc_taxi_green_dec_2016


0

CBM_2


0

diamonds


0

books


0

gas_turbine_co_and_nox_emission


0

elevators


0

house_sales


0

medical_charges


0

house


0

CASP


0

CBM_1


0

california


0

players_22


0



In [159]:
# Common filename prefix of the chunked delays_zurich_transport CSV parts.
# NOTE(review): no visible cell reads this value before `path` is reassigned
# further down — confirm it is still needed.
path = "raw/delays_zurich_transport/delays_zurich_transport_part"

In [171]:

import glob

# The CSV parts and the combined parquet file live in the same dataset
# folder; deriving both paths from one variable keeps them from drifting
# apart (the pattern previously pointed at a different folder than the
# output path).
dataset_dir = "raw/delays_zurich_transport"

# Sorted so the zero-padded parts (_00, _01, ...) are concatenated in order.
files = sorted(glob.glob(f"{dataset_dir}/delays_zurich_transport_part_0*.csv"))

print(files)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail early with the location that was searched instead.
if not files:
    raise FileNotFoundError(
        f"No delays_zurich_transport_part_0*.csv files found in {dataset_dir}"
    )

df_list = []
for f in files:
    print("Reading:", f)
    df_list.append(pd.read_csv(f))

# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
df = pd.concat(df_list, ignore_index=True)

df.to_parquet(f"{dataset_dir}/delays_zurich_transport.parquet", index=False)
print("Saved parquet successfully!")


['raw/delays_zurich_transport/delays_zurich_transport_part_00.csv', 'raw/delays_zurich_transport/delays_zurich_transport_part_01.csv', 'raw/delays_zurich_transport/delays_zurich_transport_part_02.csv', 'raw/delays_zurich_transport/delays_zurich_transport_part_03.csv']
Reading: raw/delays_zurich_transport/delays_zurich_transport_part_00.csv
Reading: raw/delays_zurich_transport/delays_zurich_transport_part_01.csv
Reading: raw/delays_zurich_transport/delays_zurich_transport_part_02.csv
Reading: raw/delays_zurich_transport/delays_zurich_transport_part_03.csv
Saved parquet successfully!


In [165]:
# Stack the two frames row-wise; each keeps its original row labels.
# NOTE(review): `df1` and `df2` are not defined in any cell visible here —
# confirm where they come from before relying on a fresh-kernel run.
df = pd.concat(objs=[df1, df2], axis="index")

In [182]:
# NOTE(review): `arff` is not used in this cell now that the file is read as
# CSV; the import is kept in case other cells rely on the name.
from scipy.io import arff

# Read the CSV file
path = "raw/houseelectric/houseelectric.csv"
data = pd.read_csv(path)

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Decode byte strings to regular strings where a column actually holds bytes.
# Only bytes values are touched: `.str.decode` on ordinary str values would
# replace them with NaN, silently wiping text columns.
for col in df.select_dtypes([object]).columns:
    df[col] = df[col].map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )

# Positional rename: the file is expected to have exactly four columns;
# pandas raises ValueError on a length mismatch.
df.columns = ["X1", "X2", "X3", "target"]
df

Unnamed: 0,X1,X2,X3,target
0,1.281000,0.92971,0.71251,2.0305
1,1.256600,0.46466,0.30651,-7.4101
2,-1.406100,1.24010,1.25910,-6.2772
3,1.398600,0.42940,-1.51610,75.3230
4,0.795180,-0.84711,-0.14732,-20.0210
...,...,...,...,...
434868,1.565700,-2.15610,-0.58176,-5.6264
434869,0.091598,1.02420,1.02790,45.7430
434870,-1.610600,-0.24423,0.54517,-13.4930
434871,-0.072314,0.22385,-1.07350,50.6210


In [183]:
# Destination for the parquet copy; same folder and base name as the CSV read above.
path = "raw/houseelectric/houseelectric.parquet"

In [184]:
# Write the current `df` to `path` as parquet, leaving the row index out of
# the file. Relies on `df` and `path` as bound by previously executed cells.
df.to_parquet(path, index = False)

In [185]:
# Read the parquet file at `path` back in and display it, so the round trip
# can be compared by eye with the frame shown before it was written.
new_df = pd.read_parquet(path)
new_df

Unnamed: 0,X1,X2,X3,target
0,1.281000,0.92971,0.71251,2.0305
1,1.256600,0.46466,0.30651,-7.4101
2,-1.406100,1.24010,1.25910,-6.2772
3,1.398600,0.42940,-1.51610,75.3230
4,0.795180,-0.84711,-0.14732,-20.0210
...,...,...,...,...
434868,1.565700,-2.15610,-0.58176,-5.6264
434869,0.091598,1.02420,1.02790,45.7430
434870,-1.610600,-0.24423,0.54517,-13.4930
434871,-0.072314,0.22385,-1.07350,50.6210


In [189]:
# Load the processed taxi dataset and display it.
# NOTE(review): in the rows displayed below, every *_sin column
# (minute_oftheday, hour, week, month) holds exactly the same value as its
# *_cos counterpart — this looks like an upstream feature-engineering mistake;
# verify how these columns were produced.
path = "processed/taxi/taxi.parquet"
df = pd.read_parquet(path)
df

Unnamed: 0,domain,minute_oftheday_sin,minute_oftheday_cos,hour_sin,hour_cos,week_sin,week_cos,month_sin,month_cos,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
0,0,-0.156434,-0.156434,-2.588190e-01,-2.588190e-01,-0.900969,-0.900969,6.123234e-17,6.123234e-17,-0.411012,0.452977,-0.410915,0.452951,0.704884
1,0,0.982450,0.982450,1.000000e+00,1.000000e+00,-0.222521,-0.222521,-1.000000e+00,-1.000000e+00,-0.411002,0.452651,-0.411108,0.452568,0.748243
2,0,-0.994056,-0.994056,-9.659258e-01,-9.659258e-01,-0.900969,-0.900969,8.660254e-01,8.660254e-01,-0.410995,0.452933,-0.411141,0.452334,0.882335
3,0,0.390731,0.390731,2.588190e-01,2.588190e-01,1.000000,1.000000,-5.000000e-01,-5.000000e-01,-0.411167,0.452444,-0.411179,0.452297,0.698107
4,0,-0.923880,-0.923880,-9.659258e-01,-9.659258e-01,-0.222521,-0.222521,6.123234e-17,6.123234e-17,-0.410961,0.453258,-0.410961,0.453139,0.699706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498808,3,-0.548293,-0.548293,-5.000000e-01,-5.000000e-01,0.623490,0.623490,5.000000e-01,5.000000e-01,-0.436441,-0.003111,-0.436412,-0.003499,0.773141
1498809,3,-0.434445,-0.434445,-2.588190e-01,-2.588190e-01,0.623490,0.623490,5.000000e-01,5.000000e-01,-0.436119,-0.002278,-0.435722,-0.003657,0.959005
1498810,3,-0.629320,-0.629320,-5.000000e-01,-5.000000e-01,0.623490,0.623490,5.000000e-01,5.000000e-01,-0.436030,-0.001942,-0.436040,-0.001951,0.906989
1498811,3,0.267238,0.267238,5.000000e-01,5.000000e-01,0.623490,0.623490,5.000000e-01,5.000000e-01,-0.436390,-0.002853,-0.436391,-0.002853,0.519522


In [191]:
# Total number of missing cells in `df`: per-column NaN counts, then summed.
df.isna().sum().sum()

0