## Data Cleaning and Feature Engineering

In [2]:
# Importing the libraries
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
# Load the dataset from the CSV file located one directory above the notebook
dataset_path = '../copd_iot_dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the first three rows
df.head(3)

Unnamed: 0,co_level,alcohol_level,voc_level,spo2,heart_rate,pm25_level,temperature,label
0,130.122292,51.18151,119.667149,97.290211,75.378475,9.01721,36.330482,Healthy
1,306.918592,112.967456,252.810187,86.365809,94.352309,58.049402,37.260475,COPD
2,279.490251,119.974531,308.133251,89.58787,76.215453,40.37884,37.150093,COPD


In [4]:
# Summarise the frame: row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   co_level       3000 non-null   float64
 1   alcohol_level  3000 non-null   float64
 2   voc_level      3000 non-null   float64
 3   spo2           3000 non-null   float64
 4   heart_rate     3000 non-null   float64
 5   pm25_level     3000 non-null   float64
 6   temperature    3000 non-null   float64
 7   label          3000 non-null   object 
dtypes: float64(7), object(1)
memory usage: 187.6+ KB


In [5]:
# Count the missing (NaN) entries in every column
missing_mask = df.isna()
missing_mask.sum()

co_level         0
alcohol_level    0
voc_level        0
spo2             0
heart_rate       0
pm25_level       0
temperature      0
label            0
dtype: int64

In [6]:
# Count rows that are exact repeats of an earlier row
duplicate_mask = df.duplicated()
duplicate_mask.sum()

np.int64(0)

#### Feature Engineering

In [7]:
# Map each distinct class name in 'label' to an integer code.
# The fitted encoder is kept so the codes can be mapped back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

In [8]:
# Preview the frame again to confirm the new 'label_encoded' column
df.head(3)

Unnamed: 0,co_level,alcohol_level,voc_level,spo2,heart_rate,pm25_level,temperature,label,label_encoded
0,130.122292,51.18151,119.667149,97.290211,75.378475,9.01721,36.330482,Healthy,2
1,306.918592,112.967456,252.810187,86.365809,94.352309,58.049402,37.260475,COPD,1
2,279.490251,119.974531,308.133251,89.58787,76.215453,40.37884,37.150093,COPD,1


#### Train test split

In [9]:
# Separate the independent features (X) from the dependent target (y).
# Both the textual label and its encoded form are removed from the features.
non_feature_columns = ['label', 'label_encoded']
X = df.drop(columns=non_feature_columns)
y = df['label_encoded']

In [10]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split
# (and therefore the fitted scaler and the saved train/test arrays) identical
# on every Restart & Run All instead of changing from run to run.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

#### Data Transformation

In [11]:
# Build a ColumnTransformer that applies standard scaling to every feature column
scaled_columns = X.columns
preprocessor = ColumnTransformer(
    transformers=[('StandardScaling', StandardScaler(), scaled_columns)]
)

# Fit the scaler on the training split only, then reuse the fitted
# transformer unchanged on the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [12]:
# Inspect the scaled training features (NumPy abbreviates the printed array)
X_train_preprocessed

array([[ 1.36223643,  1.32831472,  1.19135229, ...,  1.55729794,
         1.15006536, -0.63210787],
       [-1.41534409, -2.14251855, -1.10296361, ..., -0.59080161,
        -0.70970894, -0.96838431],
       [-1.12292792, -1.1517965 , -1.2578256 , ..., -1.024481  ,
        -1.38773398, -0.57035106],
       ...,
       [-1.23846579, -1.07164746, -1.0483516 , ..., -1.30648515,
        -0.58563671, -1.11354776],
       [-0.36710896, -0.14389095,  0.13639469, ..., -0.32565435,
         0.22328598,  0.36725564],
       [-1.22375815, -1.3945001 , -1.33886771, ...,  0.526731  ,
        -1.23682718, -0.63508429]])

In [13]:
# Inspect the scaled test features (NumPy abbreviates the printed array)
X_test_preprocessed

array([[-0.1440443 ,  1.18071358, -0.29470645, ...,  0.14613873,
         0.4591848 , -1.37430887],
       [ 0.06055071,  0.28192707,  0.15403621, ...,  1.93558752,
         0.25122887, -0.52307966],
       [ 0.84858965, -0.05956638,  0.85642716, ...,  1.93928655,
         0.92643917, -0.03691726],
       ...,
       [-1.11377673, -1.26419655, -1.60294793, ..., -0.81437678,
        -0.90579855, -0.71343279],
       [-0.4054009 ,  0.08147954,  0.50920089, ...,  0.37632433,
        -0.29903748, -0.01969392],
       [ 1.12326752,  2.06523333,  1.52260245, ..., -0.10088129,
        -0.18101116,  0.08204125]])

#### Saving the preprocessor object and the train test datasets

In [14]:
# Saving the preprocessor objects
# Persist the fitted scaler pipeline and the fitted label encoder so the same
# transformations learned in this notebook can be reloaded elsewhere.
models_dir = Path('../copd_models')
# open(..., 'wb') does not create missing folders, so make sure the target exists
models_dir.mkdir(parents=True, exist_ok=True)

with open(models_dir / 'preprocessor-scaler.pkl', 'wb') as file:
    pickle.dump(obj=preprocessor, file=file)

with open(models_dir / 'preprocessor-label.pkl', 'wb') as file:
    pickle.dump(obj=label_encoder, file=file)

In [15]:
# Saving the train-test dataset
# Append the encoded target as the last column of each scaled feature matrix,
# so every saved array holds the features followed by the label.
train_dataset = np.column_stack((X_train_preprocessed, np.array(y_train)))
test_dataset = np.column_stack((X_test_preprocessed, np.array(y_test)))

output_dir = Path('../data_transformation')
# open(..., 'wb') does not create missing folders, so make sure the target exists
output_dir.mkdir(parents=True, exist_ok=True)

with open(output_dir / 'train.npy', 'wb') as file:
    np.save(arr=train_dataset, file=file)

with open(output_dir / 'test.npy', 'wb') as file:
    np.save(arr=test_dataset, file=file)