# Dataset Collection: Kaggle Import
---

## Raw Dataset Import and Preprocessing

Raw datasets from the `./kaggledatasets` directory are imported into pandas DataFrames. Non-numeric columns and rows containing missing values are removed, and each dataset is truncated to a maximum length.

The cleaned collection is stored as:
- Dataset `dataset_sequences_200.pkl` (min length: 200, max length: 200)
    - A CSV view `dataset_sequences_200.csv` of the dataset
- Dataset `dataset_sequences_10k.pkl` (min length: 1, max length: 10,000)
    - A CSV view `dataset_sequences_10k.csv` of the dataset

⚠️ **Note:** Since downloaded Kaggle datasets may vary over time, reproducing the *exact* same DataFrames is not guaranteed. The `dataset_sequences_200.pkl` and `dataset_sequences_10k.pkl` datasets are provided for reproducibility.

---

## Requirements

In [None]:
%pip install pandas==2.2.3

## Kaggle Import

In [None]:
# Directory that holds the raw Kaggle files; passed to kaggle_import() in the cells below.
dataset_directory = './kaggledatasets'

In [None]:
import os
import pickle
import pandas as pd
import numpy as np

def kaggle_import(dataset_directory):
    """Read every file in *dataset_directory* into a pandas DataFrame.

    Each directory entry is passed to ``pd.read_csv``. Entries that cannot be
    read (sub-directories, unreadable or empty files, malformed CSV, files
    that are not valid text) are reported on stdout and skipped.

    Entries are processed in sorted name order so that the resulting
    dictionary (and everything derived from it) has a reproducible order
    regardless of the order in which the file system lists them.

    Args:
        dataset_directory: Path of the directory containing the raw files.

    Returns:
        dict mapping the file name without its extension to the DataFrame
        read from that file. If several files share the same stem, the one
        that comes last in sorted order wins.
    """
    print("Import kaggle data...")
    dataset_dfs = {}

    # import files in the directory (sorted: os.listdir order is arbitrary)
    for file_name in sorted(os.listdir(dataset_directory)):
        file_path = os.path.join(dataset_directory, file_name)
        try:
            df = pd.read_csv(file_path)
        except pd.errors.ParserError as e:
            print(f"Error reading file '{file_name}': {e}")
        except UnicodeDecodeError as e:
            print(f"UnicodeDecodeError reading file '{file_name}': {e}")
        except pd.errors.EmptyDataError as e:
            print(f"EmptyDataError reading file '{file_name}': {e}")
        except OSError as e:
            # Covers IsADirectoryError as well as PermissionError (raised for
            # directories on Windows) and other OS-level read failures.
            print(f"Error reading file '{file_name}': {e}")
        else:
            dataset_name = os.path.splitext(file_name)[0]
            dataset_dfs[dataset_name] = df

    print("Amount of dataframes imported:", len(dataset_dfs))

    return dataset_dfs
    
def static_array_size(dataset_dfs, sequence_length):
    """Reduce every dataset to exactly *sequence_length* complete numeric rows.

    For each DataFrame in *dataset_dfs*:

    1. columns containing any value that is not an ``int`` or ``float`` are
       removed,
    2. rows containing missing values are removed,
    3. the first *sequence_length* remaining rows are kept.

    A dataset that has fewer than *sequence_length* rows left after step 2
    cannot provide a full-length sequence; all of its columns are removed so
    that it contributes no sequences at all. The length is checked after the
    missing-value rows are dropped, so the fixed length is actually
    guaranteed for every column that is kept.

    The dictionary is updated in place; the DataFrames passed in are not
    modified.

    Args:
        dataset_dfs: dict mapping dataset names to DataFrames.
        sequence_length: Required number of rows per dataset.

    Returns:
        None. *dataset_dfs* is modified in place.
    """
    print("Clean up kaggle data...")

    def _is_numeric(value):
        return isinstance(value, (int, float))

    # static array size
    # Only values of existing keys are replaced, so iterating items() is safe.
    for key, df in dataset_dfs.items():
        non_numeric = [
            column for column in df.columns
            if not df[column].apply(_is_numeric).all()
        ]
        cleaned = df.drop(columns=non_numeric).dropna().head(sequence_length)

        # Too few complete rows: keep no column rather than a short sequence.
        if len(cleaned) < sequence_length:
            cleaned = cleaned.drop(columns=cleaned.columns)

        dataset_dfs[key] = cleaned

def truncate_array_size(dataset_dfs, max_sequence_length):
    """Keep numeric columns only and cap each dataset at *max_sequence_length* rows.

    Every DataFrame in *dataset_dfs* has its columns with non-``int``/``float``
    values dropped (in place on that DataFrame). The dictionary entry is then
    replaced by the frame without rows containing missing values, limited to
    the first *max_sequence_length* rows.

    Args:
        dataset_dfs: dict mapping dataset names to DataFrames.
        max_sequence_length: Upper bound on the number of rows kept.

    Returns:
        None. *dataset_dfs* is modified in place.
    """
    print("Clean up kaggle data...")

    numeric_types = (int, float)

    # up to specific array size
    for name in dataset_dfs:
        frame = dataset_dfs[name]

        # Collect the offending columns first, then remove them in one call.
        rejected = []
        for col_name in frame.columns:
            only_numbers = frame[col_name].apply(
                lambda cell: isinstance(cell, numeric_types)
            ).all()
            if not only_numbers:
                rejected.append(col_name)
        frame.drop(columns=rejected, inplace=True)

        complete_rows = frame.dropna()
        dataset_dfs[name] = complete_rows.head(max_sequence_length)

def save_pickle(dataset_dfs, dataset_import):
    """Serialize *dataset_dfs* with pickle to ``<dataset_import>.pkl``.

    Args:
        dataset_dfs: Object to store (here: the dict of DataFrames).
        dataset_import: Output path without the ``.pkl`` extension.
    """
    target_path = f'{dataset_import}.pkl'
    with open(target_path, 'wb') as pickle_file:
        pickle.dump(dataset_dfs, pickle_file)

def save_csv(dataset_dfs, dataset_import):
    """Write a wide CSV view of all dataset columns to ``<dataset_import>.csv``.

    Every column of every DataFrame becomes one CSV row with the fields

    * ``_source`` - ``"<dataset name>#<column name>"``,
    * ``_length`` - number of values in the column,
    * ``_0``, ``_1``, ... - the column's values in order.

    Rows are sorted by ``_length`` in descending order. The sort is stable,
    so columns of equal length keep the order in which they appear in
    *dataset_dfs*. Shorter columns leave their trailing value cells empty.

    If there is no column to export, a CSV containing only the
    ``_source,_length`` header is written.

    Args:
        dataset_dfs: dict mapping dataset names to DataFrames.
        dataset_import: Output path without the ``.csv`` extension.
    """
    print("Building CSV view...")

    rows = []
    for ds_name, df in dataset_dfs.items():
        for col in df.columns:
            series = df[col]

            # Row: dataset#column as source, plus the number of entries
            row = {"_source": f"{ds_name}#{col}", "_length": len(series)}

            # Fill values with column index naming (_0, _1, ...)
            row.update((f"_{i}", val) for i, val in enumerate(series))

            rows.append(row)

    if rows:
        wide_df = pd.DataFrame(rows)
    else:
        # Without any row the frame would lack the "_length" sort key.
        wide_df = pd.DataFrame(columns=["_source", "_length"])

    # Sort by _length descending (longest columns first); a stable sort keeps
    # the output order deterministic when many columns share the same length.
    wide_df = wide_df.sort_values(by="_length", ascending=False, kind="stable")
    wide_df = wide_df.reset_index(drop=True)

    print(f"Created CSV view: {len(wide_df)} rows, {wide_df.shape[1]} columns")
    wide_df.to_csv(f'{dataset_import}.csv', index=False)
    print(f'Saved {dataset_import}.csv')

## Create Dataset `dataset_sequences_200.pkl` (min length: 200, max length: 200)

In [None]:
# Import the raw files, clean them with a target length of 200 rows, and pickle the result.
dataset_sequences_200 = kaggle_import(dataset_directory)
static_array_size(dataset_sequences_200, 200)  # updates the dictionary in place
save_pickle(dataset_sequences_200, 'dataset_sequences_200')  # writes dataset_sequences_200.pkl

## Create Dataset `dataset_sequences_10k.pkl` (min length: 1, max length: 10,000)

In [None]:
# Import the raw files again: the cleaning functions modify the dictionary they receive,
# so this dataset starts from a fresh, uncleaned import.
dataset_sequences_10k = kaggle_import(dataset_directory)
truncate_array_size(dataset_sequences_10k, 10000)  # updates the dictionary in place
save_pickle(dataset_sequences_10k, 'dataset_sequences_10k')  # writes dataset_sequences_10k.pkl

## (Optional) Save as CSV View

In [None]:
# Requires the two dataset dictionaries created in the cells above.
save_csv(dataset_sequences_200, 'dataset_sequences_200')  # writes dataset_sequences_200.csv
save_csv(dataset_sequences_10k, 'dataset_sequences_10k')  # writes dataset_sequences_10k.csv