In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the training and test splits from their CSV files.
train_csv = '/home/data/train.csv'
test_csv = '/home/data/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Report the (rows, columns) dimensions of each split.
train_dims = train.shape
print("Train shape:", train_dims)
test_dims = test.shape
print("Test shape:", test_dims)

# List the column names of each split; the leading "\n" in each label
# puts a blank line before the entry.
train_cols = train.columns.tolist()
print("\nTrain columns:", train_cols)
test_cols = test.columns.tolist()
print("\nTest columns:", test_cols)

# Show the dtype of every training column (no dtype was passed to
# read_csv above, so these are whatever pandas inferred).
print("\nTrain dtypes:")
train_dtypes = train.dtypes
print(train_dtypes)

Train shape: (5432400, 8)
Test shape: (603600, 7)

Train columns: ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure']

Test columns: ['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out']

Train dtypes:
id             int64
breath_id      int64
R              int64
C              int64
time_step    float64
u_in         float64
u_out          int64
pressure     float64
dtype: object


In [2]:
# Summary statistics for the target column.
print("Target variable (pressure) statistics:")
pressure_summary = train['pressure'].describe()
print(pressure_summary)

# Number of distinct breath_id values in each split.
n_train_breaths = train['breath_id'].nunique()
print("\nUnique breaths in train:", n_train_breaths)
n_test_breaths = test['breath_id'].nunique()
print("Unique breaths in test:", n_test_breaths)

# Row counts per distinct R value, ordered by the value rather than
# by frequency.
print("\nR values distribution:")
r_counts = train['R'].value_counts()
print(r_counts.sort_index())

# Same breakdown for C.
print("\nC values distribution:")
c_counts = train['C'].value_counts()
print(c_counts.sort_index())

Target variable (pressure) statistics:
count    5.432400e+06
mean     1.121807e+01
std      8.106474e+00
min     -1.895744e+00
25%      6.329607e+00
50%      7.032628e+00
75%      1.364103e+01
max      6.482099e+01
Name: pressure, dtype: float64



Unique breaths in train: 67905
Unique breaths in test: 7545

R values distribution:
R
5     1791200
20    1474080
50    2167120
Name: count, dtype: int64

C values distribution:
C
10    2017520
20    1640160
50    1774720
Name: count, dtype: int64


In [None]:
# Inspect one breath: take the breath_id found in the first row of train
# and keep every row that carries that same id.
breath_ids = train['breath_id']
in_first_breath = breath_ids == breath_ids.iloc[0]
sample_breath = train[in_first_breath]
print("Sample breath shape:", sample_breath.shape)

# Preview the first ten rows of the selected breath.
print("\nFirst few rows:")
preview = sample_breath.head(10)
print(preview)

# Min/max of time_step and of pressure within this breath.
time_steps = sample_breath['time_step']
print("\nTime step range:", time_steps.min(), "to", time_steps.max())
pressures = sample_breath['pressure']
print("Pressure range for this breath:", pressures.min(), "to", pressures.max())