In [1]:
import pandas as pd
import numpy as np
from google.colab import files


In [6]:
# Upload the workbook from the local machine. The keys of `uploaded` are the
# filenames as saved in the Colab session (Colab may append a suffix such as
# " (3)" when the name already exists).
uploaded = files.upload()
if not uploaded:
    # A cancelled dialog yields nothing to iterate; fail with a clear message
    # instead of the bare StopIteration that next(iter(...)) would raise.
    raise ValueError("No file was uploaded; re-run this cell and select the Excel workbook.")
file_name = next(iter(uploaded))  # first uploaded filename
print(f"Uploaded: {file_name}")

# Load the workbook into a DataFrame (.xlsx reading relies on openpyxl, which
# this notebook installs in its pip cell).
df = pd.read_excel(file_name)
print(f"Shape: {df.shape}")
df.head()

Saving Driver Physiological.xlsx to Driver Physiological (3).xlsx
Uploaded: Driver Physiological (3).xlsx
Shape: (1230, 6)


Unnamed: 0,Body_Temperature,Heart_Rate,SPO2,Driver_State,Unnamed: 4,Unnamed: 5
0,95.5,54,85,5,,0-3 : Normal
1,95.9,54,85,5,,4-6 : discomfort
2,96.1,54,85,5,,7-8 : Stress
3,100.22,54,99,5,,9: Red Alert
4,100.1,54,79,7,,


In [7]:
# Overview: dtype of every column, then the number of nulls per column
print("=== Data Types ===", df.dtypes, sep="\n")
print("\n=== Missing Values ===", df.isna().sum(), sep="\n")

# Summary statistics across all columns, numeric and non-numeric alike
# (left as the last expression so the notebook renders it as a table)
df.describe(include='all')

=== Data Types ===
Body_Temperature    float64
Heart_Rate            int64
SPO2                  int64
Driver_State          int64
Unnamed: 4          float64
Unnamed: 5           object
dtype: object

=== Missing Values ===
Body_Temperature       0
Heart_Rate             0
SPO2                   0
Driver_State           0
Unnamed: 4          1230
Unnamed: 5          1226
dtype: int64


Unnamed: 0,Body_Temperature,Heart_Rate,SPO2,Driver_State,Unnamed: 4,Unnamed: 5
count,1230.0,1230.0,1230.0,1230.0,0.0,4
unique,,,,,,4
top,,,,,,0-3 : Normal
freq,,,,,,1
mean,99.43235,74.349593,80.031707,6.986179,,
std,2.617127,9.697528,11.882314,2.180012,,
min,94.0,54.0,60.0,0.0,,
25%,97.16,67.0,70.0,6.0,,
50%,99.32,74.0,80.0,7.0,,
75%,101.66,81.0,90.0,9.0,,


In [8]:
# 'Unnamed: 4' is empty in every row and 'Unnamed: 5' only holds a four-line
# legend for the Driver_State scale (0-3 Normal, 4-6 discomfort, 7-8 Stress,
# 9 Red Alert), so neither column carries measurements.
# `columns=` already selects the axis; the extra `axis=1` was redundant.
df = df.drop(columns=['Unnamed: 4', 'Unnamed: 5'])

# Verify remaining missing values
print("\nMissing Values After Cleaning:")
print(df.isnull().sum())


Missing Values After Cleaning:
Body_Temperature    0
Heart_Rate          0
SPO2                0
Driver_State        0
dtype: int64


In [9]:
# Driver_State holds discrete state codes rather than a measurement, so store
# it with pandas' categorical dtype.
state_codes = df['Driver_State']
df['Driver_State'] = state_codes.astype('category')

In [10]:
# Report the observed min/max of each vital sign so biologically impossible
# readings stand out.
body_temp = df['Body_Temperature']
heart_rate = df['Heart_Rate']
spo2 = df['SPO2']

print("\nValue Ranges:")
print(f"Body Temperature: {body_temp.min()}°F - {body_temp.max()}°F")
print(f"Heart Rate: {heart_rate.min()} - {heart_rate.max()} bpm")
print(f"SPO2: {spo2.min()}% - {spo2.max()}%")


Value Ranges:
Body Temperature: 94.0°F - 104.5°F
Heart Rate: 54 - 109 bpm
SPO2: 60% - 100%


In [11]:
# Keep only rows whose body temperature lies in the plausible 95-104 °F band;
# both bounds are inclusive, and rows outside the band are dropped.
temp_in_range = df['Body_Temperature'].between(95, 104)
df = df[temp_in_range]

In [12]:
# Final verification
print("\nFinal Data Summary:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
# describe() covers the numeric columns only here; Driver_State is categorical
# by this point and is therefore left out of the table.
print(df.describe())


Final Data Summary:
<class 'pandas.core.frame.DataFrame'>
Index: 1224 entries, 0 to 1229
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Body_Temperature  1224 non-null   float64 
 1   Heart_Rate        1224 non-null   int64   
 2   SPO2              1224 non-null   int64   
 3   Driver_State      1224 non-null   category
dtypes: category(1), float64(1), int64(2)
memory usage: 39.8 KB
None
       Body_Temperature   Heart_Rate         SPO2
count       1224.000000  1224.000000  1224.000000
mean          99.424338    74.325163    80.053105
std            2.600079     9.697408    11.876702
min           95.000000    54.000000    60.000000
25%           97.160000    67.000000    70.000000
50%           99.320000    74.000000    80.000000
75%          101.660000    81.000000    90.000000
max          104.000000   109.000000   100.000000


In [13]:
# Remove exact duplicate rows and report how many were discarded
initial_count = df.shape[0]
df = df.drop_duplicates()
removed = initial_count - df.shape[0]
print(f"\nRemoved {removed} duplicate rows")
print(f"New shape: {df.shape}")



Removed 0 duplicate rows
New shape: (1224, 4)


In [15]:
def treat_outliers(col, data=None, k=1.5):
    """Clip one column to its IQR fences and return the clipped Series.

    Parameters
    ----------
    col : label
        Name of the column to treat.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``treat_outliers(col)`` calls
        working.
    k : float, default 1.5
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    pandas.Series
        The column with values below ``q1 - k*iqr`` raised to that fence and
        values above ``q3 + k*iqr`` lowered to that fence. The source frame
        is not modified; the caller assigns the result back if desired.
    """
    # Fall back to the notebook global only when no frame is supplied, so the
    # function no longer has to depend on hidden kernel state.
    frame = df if data is None else data
    series = frame[col]
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return series.clip(lower, upper)

# Clip each vital-sign column to its IQR fences and show the resulting range
physio_cols = ['Body_Temperature', 'Heart_Rate', 'SPO2']
for signal in physio_cols:
    df[signal] = treat_outliers(signal)
    low, high = df[signal].min(), df[signal].max()
    print(f"\n{signal} outliers treated:")
    print(f"New range: {low:.1f} to {high:.1f}")

# Convert Data Types: make sure Driver_State is stored as a categorical
state_codes = df['Driver_State']
df['Driver_State'] = state_codes.astype('category')
print("\nData types optimized:", df.dtypes, sep="\n")

# Biological Validation: final min/max of every vital sign after clipping
body_temp = df['Body_Temperature']
heart_rate = df['Heart_Rate']
spo2 = df['SPO2']
print("\nFinal value ranges:")
print(f"Body Temperature: {body_temp.min():.1f}°F to {body_temp.max():.1f}°F")
print(f"Heart Rate: {heart_rate.min()} to {heart_rate.max()} bpm")
print(f"SPO2: {spo2.min()}% to {spo2.max()}%")



Body_Temperature outliers treated:
New range: 95.0 to 104.0

Heart_Rate outliers treated:
New range: 54.0 to 102.0

SPO2 outliers treated:
New range: 60.0 to 100.0

Data types optimized:
Body_Temperature     float64
Heart_Rate             int64
SPO2                   int64
Driver_State        category
dtype: object

Final value ranges:
Body Temperature: 95.0°F to 104.0°F
Heart Rate: 54 to 102 bpm
SPO2: 60% to 100%


In [16]:
# Persist the cleaned frame as CSV (row index omitted) and then trigger a
# Colab download of that file.
output_file = 'cleaned_driver_physiology.csv'
df.to_csv(path_or_buf=output_file, index=False)
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [4]:
!pip install openpyxl

