# Dataset Exploration

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from pathlib import Path

# Location of the input CSV, given relative to the notebook's working directory.
DATA_PATH = Path('../data/Challenge2.csv')

In [2]:
# Read the CSV untouched into `df_raw`; cleaning happens in later cells so the
# raw frame stays available for comparison.
df_raw = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first five rows (rich display as the cell's last expression).
df_raw.head(n=5)

Unnamed: 0,depth,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col191,col192,col193,col194,col195,col196,col197,col198,col199,col200
0,9000.1,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,...,224.0,225.0,226.0,225.0,224.0,223.0,221.0,220.0,217.0,217.0
1,9000.2,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,...,224.0,225.0,226.0,225.0,224.0,223.0,222.0,221.0,219.0,219.0
2,9000.3,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,...,224.0,225.0,226.0,226.0,225.0,224.0,224.0,223.0,223.0,223.0
3,9000.4,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,...,224.0,225.0,226.0,226.0,225.0,225.0,225.0,225.0,225.0,225.0
4,9000.5,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,224.0,...,225.0,224.0,223.0,223.0,224.0,224.0,225.0,225.0,225.0,225.0


*This looks like a line-scan camera image: each row holds the pixel values recorded at one depth.*

In [5]:
# Dimensions of the raw frame as (rows, columns).
raw_shape = df_raw.shape
print(f'shape: {raw_shape}')

shape: (5461, 201)


In [6]:
# Build the boolean null mask once, then summarise it two ways:
#   - how many individual cells are missing
#   - how many rows contain at least one missing cell
null_mask = df_raw.isnull()
total_null_cells = null_mask.sum().sum()
rows_with_nulls = null_mask.any(axis=1).sum()

print(f'Total null cells: {total_null_cells}')
print(f'Rows with null cells: {rows_with_nulls}')


Total null cells: 201
Rows with null cells: 1


*Only one row contains nulls, and its 201 null cells match the 201 columns, so the entire row is empty.*

In [12]:
# Null row content: every row of the raw frame with at least one missing cell.
null_rows = df_raw[df_raw.isnull().any(axis=1)]
print(null_rows)

# Count the flagged rows that are only *partially* null, i.e. that still carry
# some real data. 0 means every flagged row is completely empty.
# The check must run on `null_rows` itself: the previous `df_raw[null_rows]`
# used a float DataFrame as a mask, which either raises or masks every cell to
# NaN depending on the pandas version, so it could only ever print 0.
partially_null = ~null_rows.isnull().all(axis=1)
print(int(partially_null.sum()))

      depth  col1  col2  col3  col4  col5  col6  col7  col8  col9  ...  \
5460    NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

      col191  col192  col193  col194  col195  col196  col197  col198  col199  \
5460     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

      col200  
5460     NaN  

[1 rows x 201 columns]
0


*We can safely drop it: it is the final, trailing row and is entirely empty, so removing it has no impact, assuming the data comes from a line-scan camera.*

In [13]:
# Remove every row that has at least one missing cell, then renumber the index
# from 0 so positions and labels agree. `df_raw` itself is left unmodified.
df = (
    df_raw
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

In [14]:
# Split the cleaned frame into two NumPy arrays:
#   depths - the 'depth' column
#   pixels - every remaining column, in original column order
is_pixel_col = df.columns != 'depth'
pixel_cols = df.columns[is_pixel_col].tolist()

depths = df['depth'].to_numpy()
pixels = df.loc[:, pixel_cols].to_numpy()

# Quick sanity report on what was extracted.
print(f'Depth array shape: {depths.shape}')
print(f'Pixel array shape: {pixels.shape}')
print(f'Pixel dtype: {pixels.dtype}')

Depth array shape: (5460,)
Pixel array shape: (5460, 200)
Pixel dtype: float64


## Depth data analysis

In [15]:
# Extent of the depth axis: smallest and largest value, their difference,
# and the number of samples.
depth_min = depths.min()
depth_max = depths.max()

print(f'Depth range: {depth_min:.1f} to {depth_max:.1f}')
print(f'Total depth interval: {depth_max - depth_min:.1f} units')
print(f'Number of samples: {depths.size}')

Depth range: 9000.1 to 9546.0
Total depth interval: 545.9 units
Number of samples: 5460


In [16]:
# Sampling interval: differences between consecutive depth samples,
# summarised with basic statistics.
depth_diffs = np.diff(depths)

# Distinct step sizes after rounding each difference to 2 decimals.
rounded_steps = np.unique(np.round(depth_diffs, 2))

report_lines = [
    '\nSampling interval:',
    f'  min:    {depth_diffs.min():.4f}',
    f'  max:    {depth_diffs.max():.4f}',
    f'  mean:   {depth_diffs.mean():.4f}',
    f'  std:    {depth_diffs.std():.6f}',
    f'  unique: {rounded_steps}',
]
print('\n'.join(report_lines))


Sampling interval:
  min:    0.1000
  max:    0.1000
  mean:   0.1000
  std:    0.000000
  unique: [0.1]


In [18]:
# True when every consecutive difference is positive, i.e. depth is strictly increasing.
(depth_diffs > 0).all()

np.True_

In [20]:
# True when, after rounding to 2 decimals, all depth steps collapse to one value.
np.unique(np.round(depth_diffs, 2)).size == 1

True

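Conclusions:
- Depth increases in uniform 0.1 steps: strictly monotonic, with no gaps.
- Because the values are strictly increasing, every depth is unique, so depth can serve as a clean PRIMARY KEY.
- Range queries on an indexed depth column will cost O(log n) to locate the start of the range plus O(k) to read the k matching rows.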