# Dataset Split
This notebook splits the dataset into smaller, logical parts to facilitate database population or further processing.

## Library Imports

We import the libraries needed to load the dataset and export the resulting tables as CSV files.

In [None]:
import pandas as pd
import csv
import os

## Dataset Loading
Load the dataset from a CSV file for analysis.

In [None]:
# Dataset loading

# Relative path to the raw data file
data_path = '../data/raw/dataset.csv'

# df stays None unless one of the read attempts below succeeds, so the
# check at the end of this cell always has a defined value to inspect.
df = None
try:
    try:
        # Attempt to read with UTF-8 encoding first
        df = pd.read_csv(data_path, encoding='utf-8')
    except UnicodeDecodeError:
        # Fall back to Latin-1 only when the bytes are not valid UTF-8
        df = pd.read_csv(data_path, encoding='latin1')
except Exception as e:
    # Notebook-level boundary: report any load failure from either attempt
    # (missing file, unreadable file, malformed CSV, ...) instead of
    # letting it propagate and leave df undefined.
    print(f"Error loading CSV file: {e}")

# Check if the dataset loaded correctly
if df is not None:
    print(f"Dataset loaded successfully. Shape: {df.shape}")
else:
    print("Error loading dataset. Please verify the file path and encoding.")


In [None]:
# Preview the first five rows of the loaded dataset (cell output)
df.head(n=5)

# Extract Unique Values for Lookup Table

This step extracts unique combinations of identifiers and descriptive labels from the dataset, which can be used to populate a normalized lookup table in the database.

In [None]:
# Build the lookup table: keep only col1 and col2, then retain the first
# occurrence of each distinct (col1, col2) combination.
df_lookup = (
    df.loc[:, ['col1', 'col2']]
    .drop_duplicates(keep='first')
)

# Show the resulting table as the cell output
df_lookup

# Export Cleaned Data to CSV
The resulting DataFrame is exported as a CSV file (`table_lookup.csv`) into the `../data/tables/` directory.

This allows for better modularity and simplifies loading the data into other systems such as relational databases or analytical pipelines.

In [None]:
# Destination of the exported lookup table
output_path = '../data/tables/table_lookup.csv'

# Make sure the target directory exists before writing: to_csv does not
# create missing parent directories and would fail with an OSError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write the lookup table as a comma-separated, UTF-8 encoded file with a
# header row and without the DataFrame index.
df_lookup.to_csv(
    path_or_buf=output_path,
    sep=',',
    na_rep='',
    header=True,
    index=False,
    encoding='utf-8',
    quoting=csv.QUOTE_MINIMAL,
    lineterminator=os.linesep,
    quotechar='"',
    decimal='.',
    errors='strict'
)