In [1]:
import os

In [2]:
%pwd

'c:\\Users\\RaviB\\GitHub\\SleepEfficiencyML\\research'

In [3]:
# Step up to the project root only while still inside research/, so that
# re-running this cell does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\RaviB\\GitHub\\SleepEfficiencyML'

## EDA

Let's explore, clean, and figure out what transformations we need.

In [59]:
import pandas as pd

# Read the ingested CSV once: `data_og` stays untouched as the raw reference,
# while `data` is the working copy that the following cells modify.
data_og = pd.read_csv("artifacts/data_ingestion/Sleep_Efficiency.csv")
data = data_og.copy()
data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


In [60]:
# Column dtypes and non-null counts: shows which columns need imputing or encoding
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 452 entries, 0 to 451
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ID                      452 non-null    int64  
 1   Age                     452 non-null    int64  
 2   Gender                  452 non-null    object 
 3   Bedtime                 452 non-null    object 
 4   Wakeup time             452 non-null    object 
 5   Sleep duration          452 non-null    float64
 6   Sleep efficiency        452 non-null    float64
 7   REM sleep percentage    452 non-null    int64  
 8   Deep sleep percentage   452 non-null    int64  
 9   Light sleep percentage  452 non-null    int64  
 10  Awakenings              432 non-null    float64
 11  Caffeine consumption    427 non-null    float64
 12  Alcohol consumption     438 non-null    float64
 13  Smoking status          452 non-null    object 
 14  Exercise frequency      446 non-null    fl

We already know from the data validation notebook that we have some null values so let's deal with those.

In [61]:
# Missing values per column before imputation
print(data.isnull().sum())

ID                         0
Age                        0
Gender                     0
Bedtime                    0
Wakeup time                0
Sleep duration             0
Sleep efficiency           0
REM sleep percentage       0
Deep sleep percentage      0
Light sleep percentage     0
Awakenings                20
Caffeine consumption      25
Alcohol consumption       14
Smoking status             0
Exercise frequency         6
dtype: int64


In [62]:
from collections import Counter

In [63]:
# Impute column by column with that column's most frequent value;
# columns without any nulls pass through unchanged.
data = data.apply(lambda col: col if not col.isnull().any() else col.fillna(col.mode()[0]))

In [64]:
# Re-check the missing-value counts after imputation
print(data.isnull().sum())

ID                        0
Age                       0
Gender                    0
Bedtime                   0
Wakeup time               0
Sleep duration            0
Sleep efficiency          0
REM sleep percentage      0
Deep sleep percentage     0
Light sleep percentage    0
Awakenings                0
Caffeine consumption      0
Alcohol consumption       0
Smoking status            0
Exercise frequency        0
dtype: int64


Now let's take a look at the other columns.

In [65]:
# Tally the gender labels to see which categories need encoding
Counter(data["Gender"])

Counter({'Female': 224, 'Male': 228})

In [66]:
# Encode gender as 0/1. Read from the untouched raw copy (the column has no nulls,
# so no imputation is lost) so that re-running this cell cannot map the
# already-encoded integers to NaN.
data["Gender"] = data_og["Gender"].map({'Female': 0, 'Male': 1})

For the bedtime and wakeup time the hour is the only important part really.

In [67]:
# Keep only the hour of day. Parse from the untouched raw copy: on a second run the
# already-converted integer hours would be interpreted as offsets from the Unix
# epoch and both columns would silently collapse to wrong values.
data["Bedtime"] = pd.to_datetime(data_og["Bedtime"]).dt.hour
data["Wakeup time"] = pd.to_datetime(data_og["Wakeup time"]).dt.hour

Now for sleep duration, we'll just leave it as is (a float, since some values fall on the half hour).

In [68]:
# Tally the distinct sleep-duration values
Counter(data["Sleep duration"])

Counter({6.0: 34,
         7.0: 154,
         8.0: 103,
         7.5: 88,
         10.0: 8,
         9.0: 26,
         8.5: 28,
         5.0: 8,
         5.5: 3})

The Sleep efficiency, REM sleep percentage, Deep sleep percentage, and Light sleep percentage columns are all really target variables. We only care about sleep efficiency here but could add the others later.

In [69]:
# List all columns to see which ones are still left to review
data.columns

Index(['ID', 'Age', 'Gender', 'Bedtime', 'Wakeup time', 'Sleep duration',
       'Sleep efficiency', 'REM sleep percentage', 'Deep sleep percentage',
       'Light sleep percentage', 'Awakenings', 'Caffeine consumption',
       'Alcohol consumption', 'Smoking status', 'Exercise frequency'],
      dtype='object')

In [70]:
# Tally the distinct values of the Awakenings column
Counter(data["Awakenings"])

Counter({0.0: 95, 3.0: 63, 1.0: 174, 2.0: 57, 4.0: 63})

Let's leave awakenings as is.

In [71]:
# Tally the distinct caffeine amounts before deciding how to encode them
Counter(data["Caffeine consumption"])

Counter({0.0: 236, 50.0: 107, 25.0: 79, 75.0: 25, 200.0: 4, 100.0: 1})

In [72]:
# Tally the distinct alcohol amounts before deciding how to encode them
Counter(data["Alcohol consumption"])

Counter({0.0: 260, 3.0: 48, 5.0: 30, 1.0: 54, 2.0: 37, 4.0: 23})

These are in mg. The values don't seem realistic, so let's just change this to a true/false flag for whether or not caffeine was consumed the day before. Let's do the same thing for alcohol.

In [73]:
# Collapse the amount into a consumed (1) / not consumed (0) flag
data["Caffeine consumption"] = (data["Caffeine consumption"] > 0).astype("int64")
Counter(data["Caffeine consumption"])

Counter({0: 236, 1: 216})

In [74]:
# Collapse the amount into a consumed (1) / not consumed (0) flag
data["Alcohol consumption"] = (data["Alcohol consumption"] > 0).astype("int64")
Counter(data["Alcohol consumption"])

Counter({0: 260, 1: 192})

Last two columns: smoking status just needs to be converted into a binary int column.

In [75]:
# Peek at the first five smoking-status labels before encoding
data["Smoking status"][0:5]

0    Yes
1    Yes
2     No
3    Yes
4     No
Name: Smoking status, dtype: object

In [76]:
# Encode smoking status as 0/1. Read from the untouched raw copy (the column has no
# nulls, so no imputation is lost) so that re-running this cell cannot map the
# already-encoded integers to NaN.
data["Smoking status"] = data_og["Smoking status"].map({'Yes': 1, 'No': 0})
data["Smoking status"][0:5]

0    1
1    1
2    0
3    1
4    0
Name: Smoking status, dtype: int64

Now looking at the exercise frequency below, we can see that exercising 0 or 1 times per week is common, and both would be considered infrequent, so let's lump them together. Then we combine the rest to make this a binary column.

In [77]:
# Tally the distinct exercise-frequency values to pick a cut-off for the binary flag
Counter(data["Exercise frequency"])

Counter({3.0: 136, 1.0: 97, 0.0: 116, 5.0: 8, 2.0: 54, 4.0: 41})

In [78]:
# Flag values above 1 as frequent exercise (1), everything else as infrequent (0).
# Recomputed from the raw column, with the same most-frequent-value imputation used
# earlier, so that a second run does not compare the 0/1 flags against the
# threshold and zero out the whole column.
exercise_raw = data_og["Exercise frequency"]
data["Exercise frequency"] = exercise_raw.fillna(exercise_raw.mode()[0]).apply(lambda x: 1 if x > 1 else 0)
Counter(data["Exercise frequency"])

Counter({1: 239, 0: 213})

In [79]:
# Preview the frame after the encodings above
data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,0,1,7,6.0,0.88,18,70,12,0.0,0,0,1,1
1,2,69,1,2,9,7.0,0.66,19,28,53,3.0,0,1,1,1
2,3,40,0,21,5,8.0,0.89,20,70,10,1.0,0,0,0,1
3,4,40,0,2,8,6.0,0.51,23,25,52,3.0,1,1,1,0
4,5,57,1,1,9,8.0,0.76,27,55,18,3.0,0,1,0,1


We can drop the target columns that we don't need for now, and also the ID column.

In [80]:
# Drop the identifier and the other candidate targets. `columns=` already selects
# the axis, so no `axis` argument is needed; errors="ignore" keeps the cell
# re-runnable once the columns are gone.
data = data.drop(
    columns=["ID", "REM sleep percentage", "Deep sleep percentage", "Light sleep percentage"],
    errors="ignore",
)
data.head()

Unnamed: 0,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,65,0,1,7,6.0,0.88,0.0,0,0,1,1
1,69,1,2,9,7.0,0.66,3.0,0,1,1,1
2,40,0,21,5,8.0,0.89,1.0,0,0,0,1
3,40,0,2,8,6.0,0.51,3.0,1,1,1,0
4,57,1,1,9,8.0,0.76,3.0,0,1,0,1


Now back to modularizing the code.

In [81]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data transformation stage."""
    root_dir: Path  # directory that train.csv and test.csv are written into
    data_path: Path  # CSV file that is read as the stage's input

In [82]:
from sleep_efficiency.constants import *
from sleep_efficiency.utils.common import read_yaml, create_directories

In [83]:
class ConfigurationManager:
    """Loads the project's YAML files and builds per-stage configuration objects."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and prepare the artifacts root.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the data transformation settings from the loaded configuration.

        Passes the stage's root directory to ``create_directories`` before
        returning.

        Returns:
            A ``DataTransformationConfig`` whose fields are ``Path`` objects, as
            the dataclass declares, rather than the raw values read from YAML.
        """
        stage = self.config.data_transformation

        create_directories([stage.root_dir])

        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
        )

In [84]:
import os
from sleep_efficiency import logger
from sklearn.model_selection import train_test_split
import pandas as pd

In [85]:
class DataTransformation:
    """Cleans the raw sleep data and writes train/test CSV splits."""

    def __init__(self, config: "DataTransformationConfig"):
        """Store the stage configuration (uses ``data_path`` and ``root_dir``)."""
        self.config = config

    @staticmethod
    def _encode_binary(column: pd.Series, mapping: dict) -> pd.Series:
        """Map category labels to integers, failing loudly on unknown labels.

        Raises:
            ValueError: If any value of ``column`` is missing from ``mapping``
                (a plain ``.map`` would silently turn it into NaN).
        """
        encoded = column.map(mapping)
        unmapped = encoded.isnull()
        if unmapped.any():
            unexpected = sorted(set(column[unmapped].astype(str)))
            raise ValueError(f"Unexpected values in column '{column.name}': {unexpected}")
        return encoded

    @staticmethod
    def _preprocess(data: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned, fully numeric copy of the raw frame.

        Steps: impute nulls with each column's most frequent value, encode the
        categorical and dose columns as 0/1 flags, reduce the timestamps to the
        hour of day, and drop the identifier and the unused target columns.
        The input frame is not modified.
        """
        # NOTE(review): the fill values are computed on the full dataset, before the
        # split, so test rows influence them; consider fitting on the train split only.
        cleaned = data.apply(lambda col: col.fillna(col.mode()[0]) if col.isnull().any() else col)

        # converting some columns to binary 0 and 1
        cleaned["Gender"] = DataTransformation._encode_binary(cleaned["Gender"], {'Female': 0, 'Male': 1})
        cleaned["Smoking status"] = DataTransformation._encode_binary(cleaned["Smoking status"], {'Yes': 1, 'No': 0})
        cleaned["Caffeine consumption"] = (cleaned["Caffeine consumption"] > 0).astype("int64")
        cleaned["Alcohol consumption"] = (cleaned["Alcohol consumption"] > 0).astype("int64")
        cleaned["Exercise frequency"] = (cleaned["Exercise frequency"] > 1).astype("int64")

        # converting time columns to just the hour
        cleaned["Bedtime"] = pd.to_datetime(cleaned["Bedtime"]).dt.hour
        cleaned["Wakeup time"] = pd.to_datetime(cleaned["Wakeup time"]).dt.hour

        # dropping the identifier and the other potential target columns
        return cleaned.drop(
            columns=["ID", "REM sleep percentage", "Deep sleep percentage", "Light sleep percentage"]
        )

    def train_test_splitting(self, random_state: int = 42):
        """Preprocess the configured CSV and write ``train.csv`` / ``test.csv``.

        Args:
            random_state: Seed passed to ``train_test_split`` so that the split
                is reproducible from run to run.
        """
        data = self._preprocess(pd.read_csv(self.config.data_path))

        logger.info("Splitting data into training and testing sets")
        train, test = train_test_split(data, random_state=random_state)

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logger.info(train.shape)
        logger.info(test.shape)

In [86]:
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config = data_transformation_config)
    data_transformation.train_test_splitting()
except Exception:
    # Record the failure with its traceback, then re-raise the original exception unchanged
    logger.exception("Data transformation stage failed")
    raise

[2024-06-25 22:59:32,006: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-06-25 22:59:32,012: INFO: common: yaml file: params.yaml loaded successfully]
[2024-06-25 22:59:32,021: INFO: common: yaml file: schema.yaml loaded successfully]
[2024-06-25 22:59:32,022: INFO: common: created directory at artifacts]
[2024-06-25 22:59:32,024: INFO: common: created directory at artifacts/data_transformation]
[2024-06-25 22:59:32,052: INFO: 1400045423: Splitting data into training and testing sets]
[2024-06-25 22:59:32,053: INFO: 1400045423: (339, 11)]
[2024-06-25 22:59:32,054: INFO: 1400045423: (113, 11)]
(339, 11)
(113, 11)
