# Dataset Loading and Inspection

Dataset Title: <title>
Dataset Subtitle: <subtitle>
Description: <description>

Problem: Build a predictive model for the target variable in this tabular dataset. This notebook loads the data, inspects its structure, identifies the target column, and prepares a concise dataset summary for downstream modeling.

In [None]:
import pandas as pd
import numpy as np
import os
import io
import json
import sys
from io import StringIO

pd.set_option('display.max_columns', None)
from IPython.display import display

In [None]:
# Dataset metadata (fill with actual values from the provided dataset metadata)
dataset_metadata = {
    'Title': '<title>',
    'Subtitle': '<subtitle>',
    'Description': '<description>',
    'Dataset files': ['<datasets>'],
}

# A single path supplied as a bare string is wrapped in a list so it can be
# indexed the same way as a real list of paths.
files = dataset_metadata.get('Dataset files', [])
files = [files] if isinstance(files, str) else files

# The first entry is the training file; an optional second entry is the test
# file. Anything that is not a list leaves both paths unset.
train_path, test_path = None, None
if isinstance(files, list):
    train_path = files[0] if len(files) > 0 else None
    test_path = files[1] if len(files) > 1 else None

def load_dataset_from_paths(paths):
    """Load the first readable dataset among ``paths`` into a DataFrame.

    Paths are tried in order. Entries that are not non-blank strings are
    skipped silently; paths that do not exist are skipped with a warning.
    The reader is chosen from the file extension:

    * ``.tsv``            -> ``pd.read_csv`` with a tab separator
    * ``.xlsx`` / ``.xls`` -> ``pd.read_excel``
    * ``.json``           -> ``pd.read_json``
    * anything else (including ``.csv`` and ``.txt``) -> ``pd.read_csv``

    Parameters
    ----------
    paths : iterable of str
        Candidate file paths.

    Returns
    -------
    The object produced by the selected pandas reader for the first path
    that loads successfully.

    Raises
    ------
    ValueError
        If ``paths`` is empty or otherwise falsy.
    FileNotFoundError
        If no path could be loaded. The message carries the last reader
        error, or states that no usable path was found when no read was
        attempted at all.
    """
    if not paths:
        raise ValueError('No dataset paths provided.')

    last_err = None
    for p in paths:
        if not isinstance(p, str) or p.strip() == '':
            continue
        if not os.path.exists(p):
            print(f'Warning: path not found: {p}')
            continue

        ext = os.path.splitext(p)[1].lower()
        try:
            if ext == '.tsv':
                # Tab-separated files need an explicit separator; the default
                # comma would collapse every row into a single column.
                return pd.read_csv(p, sep='\t')
            if ext in ('.xlsx', '.xls'):
                return pd.read_excel(p)
            if ext == '.json':
                return pd.read_json(p)
            # .csv, .txt and unknown extensions fall back to the CSV reader.
            return pd.read_csv(p)
        except Exception as e:
            # Best effort: remember the failure and try the next candidate.
            last_err = e
            print(f'Error reading {p}: {e}')

    if last_err is None:
        # No read was attempted, so there is no reader error to report.
        raise FileNotFoundError(
            'None of the dataset files could be loaded. '
            'No existing, non-empty path was found.'
        )
    raise FileNotFoundError(
        'None of the dataset files could be loaded. Last error: ' + str(last_err)
    ) from last_err

# Load datasets
# Both frames start as None so later cells can test whether loading worked.
df = None
test_df = None

if train_path:
    try:
        df = load_dataset_from_paths([train_path])
    except Exception as e:
        print('Failed to load training dataset:', e)
else:
    # No training file configured: report it the same way as a load failure.
    print('Failed to load training dataset:', 'No training dataset file specified.')

# The optional second file is loaded independently of the training file.
if test_path:
    try:
        test_df = load_dataset_from_paths([test_path])
        print('Validation dataset loaded:', test_df.shape)
    except Exception as e:
        print('Failed to load validation dataset:', e)

if df is None:
    print('Training dataset not loaded.')
else:
    print('Training dataset loaded:', df.shape)

In [None]:
# Inspect the training frame and build `dataset_summary`.
# Both branches bind `target_col` and `dataset_summary`, so later cells can
# rely on the names existing even when loading failed.
if df is not None:
    print('Training dataset shape:', df.shape)
    print('Columns:', df.columns.tolist())
    print('Data types:\n', df.dtypes)
    missing_per_column = df.isnull().sum()
    print('Missing values per column:\n', missing_per_column)

    # Target detection: the first column whose lower-cased name is a known
    # candidate wins; otherwise fall back to the last column, if there is one.
    target_candidates = ['target', 'label', 'class', 'y']
    target_col = next(
        (c for c in df.columns if str(c).lower() in target_candidates),
        None,
    )
    if target_col is None and len(df.columns) > 0:
        target_col = df.columns[-1]
    print('Identified target column:', target_col)

    print('First few rows:')
    sample_rows = df.head()
    display(sample_rows)

    # describe() raises ValueError on a frame without columns, so substitute
    # an empty frame. Computed once and reused for display and summary.
    if df.shape[1] > 0:
        stats = df.describe(include='all')
    else:
        stats = pd.DataFrame()
    print('Statistical summary (all columns):')
    display(stats)

    # Build dataset summary; df.info() writes to a stream, so capture its text.
    buf = io.StringIO()
    df.info(buf=buf)
    info_str = buf.getvalue()

    dataset_summary = {
        'shape': df.shape,
        'columns': df.shape[1],
        'missing_values': bool(df.isnull().any().any()),
        'sample_data': sample_rows.to_dict(orient='records'),
        'statistical_summary': stats.to_dict(),
        'info': info_str
    }
else:
    # Nothing was loaded: keep the same keys with empty placeholder values.
    target_col = None
    dataset_summary = {
        'shape': None,
        'columns': 0,
        'missing_values': False,
        'sample_data': [],
        'statistical_summary': {},
        'info': ''
    }

print('Dataset summary prepared.')
print(dataset_summary)

In [None]:
def detect_device():
    """Return 'cuda' if PyTorch or TensorFlow reports a GPU, else 'cpu'.

    PyTorch is probed first and TensorFlow only if PyTorch did not report a
    GPU. Each probe is best effort: any exception raised while importing or
    querying a library (including the library not being installed) counts as
    "no GPU" for that library.
    """
    def _torch_reports_gpu():
        # PyTorch check
        try:
            import torch
            return bool(torch.cuda.is_available())
        except Exception:
            return False

    def _tensorflow_reports_gpu():
        # TensorFlow check
        try:
            import tensorflow as tf
            return bool(tf.config.list_physical_devices('GPU'))
        except Exception:
            return False

    # `or` short-circuits, so TensorFlow is not imported once PyTorch has
    # already found a GPU.
    if _torch_reports_gpu() or _tensorflow_reports_gpu():
        return 'cuda'
    return 'cpu'

device = detect_device()
print('Detected device:', device)

# Configure environment to use GPU if available.
# setdefault keeps any CUDA_VISIBLE_DEVICES value already set (for example by
# a job scheduler on a shared machine) rather than forcing GPU 0.
# NOTE(review): this variable is presumably only read when a framework first
# initialises CUDA, so it may not affect libraries already initialised above
# - confirm.
if device == 'cuda':
    os.environ.setdefault('CUDA_VISIBLE_DEVICES', '0')

# Update dataset_summary with device info if available; if the summary cell
# was not run, start a fresh summary holding only the device.
try:
    dataset_summary['device'] = device
except NameError:
    dataset_summary = {'device': device}

print('Device configured for future ML tasks:', device)

- The dataset has been loaded and examined.
- A summary dictionary named `dataset_summary` has been created containing the shape, column count, a missing-values flag, a sample of the data, a statistical description, the dataset info, and the detected compute device.

In [None]:
# Display the final dataset summary as JSON for easy use in ML pipelines.
# The summary can hold values json cannot encode natively (for example pandas
# Timestamps or numpy scalars from the sample rows and describe() output);
# default=str renders those as strings instead of raising TypeError.
print(json.dumps(dataset_summary, indent=2, default=str))