In [1]:
import os

# Run from the project root so relative paths ("config.yml", "artifacts/...")
# resolve. The guard makes the cell idempotent: re-running it no longer climbs
# one more directory each time.
# NOTE(review): assumes config.yml lives in the project root (it is read
# relative to the working directory further down) -- confirm.
if not os.path.exists("config.yml"):
    os.chdir("../")

In [2]:
# Show the current working directory to verify the chdir above took effect.
%pwd

'e:\\project\\Customer Purchase Prediction'

In [3]:
import pandas as pd

# Load the ingested CSV into a DataFrame and preview its first rows.
raw_csv_path = "artifacts/data_ingestion/Sleep_Efficiency.csv"
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,Female,2021-03-06 01:00:00,2021-03-06 07:00:00,6.0,0.88,18,70,12,0.0,0.0,0.0,Yes,3.0
1,2,69,Male,2021-12-05 02:00:00,2021-12-05 09:00:00,7.0,0.66,19,28,53,3.0,0.0,3.0,Yes,3.0
2,3,40,Female,2021-05-25 21:30:00,2021-05-25 05:30:00,8.0,0.89,20,70,10,1.0,0.0,0.0,No,3.0
3,4,40,Female,2021-11-03 02:30:00,2021-11-03 08:30:00,6.0,0.51,23,25,52,3.0,50.0,5.0,Yes,1.0
4,5,57,Male,2021-03-13 01:00:00,2021-03-13 09:00:00,8.0,0.76,27,55,18,3.0,0.0,3.0,No,3.0


In [4]:
# Count missing values per column to see which ones need imputation.
missing_counts = data.isnull().sum()
print(missing_counts)

ID                         0
Age                        0
Gender                     0
Bedtime                    0
Wakeup time                0
Sleep duration             0
Sleep efficiency           0
REM sleep percentage       0
Deep sleep percentage      0
Light sleep percentage     0
Awakenings                20
Caffeine consumption      25
Alcohol consumption       14
Smoking status             0
Exercise frequency         6
dtype: int64


In [5]:
def _fill_missing_with_mode(column):
    """Return the column with nulls replaced by its most frequent value."""
    if column.isnull().any():
        return column.fillna(column.mode()[0])
    return column


# Columns without nulls pass through unchanged.
data = data.apply(_fill_missing_with_mode)

In [6]:
# Encode the two binary categorical columns as 0/1 integers.
binary_encodings = {
    "Gender": {'Female': 0, 'Male': 1},
    "Smoking status": {'Yes': 1, 'No': 0},
}
for column_name, encoding in binary_encodings.items():
    data[column_name] = data[column_name].map(encoding)

In [7]:
# Reduce both timestamp columns to the hour of day (minutes are discarded).
for time_column in ("Bedtime", "Wakeup time"):
    timestamps = pd.to_datetime(data[time_column])
    data[time_column] = timestamps.dt.hour

In [8]:
# List the column names currently in the frame.
data.columns

Index(['ID', 'Age', 'Gender', 'Bedtime', 'Wakeup time', 'Sleep duration',
       'Sleep efficiency', 'REM sleep percentage', 'Deep sleep percentage',
       'Light sleep percentage', 'Awakenings', 'Caffeine consumption',
       'Alcohol consumption', 'Smoking status', 'Exercise frequency'],
      dtype='object')

In [9]:
from collections import Counter

# Collapse each consumption amount to a 0/1 flag (any positive amount -> 1)
# and print the resulting class balance for each column.
for consumption_column in ("Caffeine consumption", "Alcohol consumption"):
    data[consumption_column] = data[consumption_column].apply(
        lambda amount: 1 if amount > 0 else 0
    )
    print(Counter(data[consumption_column]))

Counter({0: 236, 1: 216})
Counter({0: 260, 1: 192})


In [10]:
# Preview the frame after the encoding steps above.
data.head()

Unnamed: 0,ID,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,REM sleep percentage,Deep sleep percentage,Light sleep percentage,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,1,65,0,1,7,6.0,0.88,18,70,12,0.0,0,0,1,3.0
1,2,69,1,2,9,7.0,0.66,19,28,53,3.0,0,1,1,3.0
2,3,40,0,21,5,8.0,0.89,20,70,10,1.0,0,0,0,3.0
3,4,40,0,2,8,6.0,0.51,23,25,52,3.0,1,1,1,1.0
4,5,57,1,1,9,8.0,0.76,27,55,18,3.0,0,1,0,3.0


In [11]:
# Remove the identifier and the three sleep-stage percentage columns, then
# preview the result. `columns=` already selects the axis, so no `axis` is needed.
columns_to_drop = [
    "ID",
    "REM sleep percentage",
    "Deep sleep percentage",
    "Light sleep percentage",
]
data = data.drop(columns=columns_to_drop)
data.head()

Unnamed: 0,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,65,0,1,7,6.0,0.88,0.0,0,0,1,3.0
1,69,1,2,9,7.0,0.66,3.0,0,1,1,3.0
2,40,0,21,5,8.0,0.89,1.0,0,0,0,3.0
3,40,0,2,8,6.0,0.51,3.0,1,1,1,1.0
4,57,1,1,9,8.0,0.76,3.0,0,1,0,3.0


In [12]:
# Report the skewness of every remaining column, one per line.
for column_name in data.columns:
    skewness = data[column_name].skew()
    print(column_name, ":", skewness)

Age : 0.03990014652808803
Gender : -0.017758796606057715
Bedtime : 0.13128779837713753
Wakeup time : 0.07582168399973778
Sleep duration : 0.018783220040188328
Sleep efficiency : -0.65027356933683
Awakenings : 0.5547934662057271
Caffeine consumption : 0.08887755385634426
Alcohol consumption : 0.3053631302927665
Smoking status : 0.6744332516491234
Exercise frequency : 0.14405427418266803


In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
# Standardise the Awakenings column in place.
# Note: the scaler is fitted on the whole frame here, before any train/test split.
scaler = StandardScaler()
awakenings = data[['Awakenings']]
scaler.fit(awakenings)
data['Awakenings'] = scaler.transform(awakenings)

In [15]:
# Preview the frame after scaling the Awakenings column.
data.head()

Unnamed: 0,Age,Gender,Bedtime,Wakeup time,Sleep duration,Sleep efficiency,Awakenings,Caffeine consumption,Alcohol consumption,Smoking status,Exercise frequency
0,65,0,1,7,6.0,0.88,-1.211366,0,0,1,3.0
1,69,1,2,9,7.0,0.66,1.041875,0,1,1,3.0
2,40,0,21,5,8.0,0.89,-0.460286,0,0,0,3.0
3,40,0,2,8,6.0,0.51,1.041875,1,1,1,1.0
4,57,1,1,9,8.0,0.76,1.041875,0,1,0,3.0


In [16]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings for the data-transformation stage.

    Instances are built by ``ConfigurationManager.get_data_transformation_config``
    from the ``data_transformation`` section of the loaded configuration.
    """

    # Directory that receives the generated train.csv and test.csv.
    root_dir: Path
    # Location of the CSV file that ``DataTransformation.transform`` reads.
    data_path: Path

In [17]:
from src.utils import read_yaml, create_directories

In [18]:
class ConfigurationManager:
    """Load the project's YAML files and hand out per-stage configuration objects.

    Args:
        config_filepath: YAML file holding the pipeline settings. Defaults to
            ``config.yml`` in the current working directory.
        schema_filepath: YAML file loaded into ``self.schema``. Defaults to
            ``schema.yml`` in the current working directory.

    Both defaults reproduce the previously hard-coded paths, so
    ``ConfigurationManager()`` behaves as before.
    """

    def __init__(
        self,
        config_filepath: Path = Path("config.yml"),
        schema_filepath: Path = Path("schema.yml"),
    ):
        # read_yaml comes from src.utils; its result is accessed by attribute below.
        self.config = read_yaml(Path(config_filepath))
        self.schema = read_yaml(Path(schema_filepath))

        # Ensure the top-level artifacts directory exists before any stage uses it.
        create_directories([self.config.artifacts_root])

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the settings object for the data-transformation stage.

        Creates the stage's ``root_dir`` as a side effect.

        Returns:
            A ``DataTransformationConfig`` whose fields are ``Path`` objects,
            matching the types the dataclass declares.
        """
        stage_settings = self.config.data_transformation

        create_directories([stage_settings.root_dir])

        # Wrap in Path so the values match the dataclass annotations instead of
        # carrying whatever type the YAML loader produced.
        return DataTransformationConfig(
            root_dir=Path(stage_settings.root_dir),
            data_path=Path(stage_settings.data_path),
        )

In [23]:
import os
from src.logger import logging
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd

In [24]:
class DataTransformation:
    """Clean the ingested dataset and write train/test CSV splits.

    Args:
        config: Stage settings. ``data_path`` is the CSV that is read and
            ``root_dir`` is the directory that receives ``train.csv`` and
            ``test.csv``.
    """

    # Identifier plus the other potential target columns; removed before splitting.
    _DROPPED_COLUMNS = [
        "ID",
        "REM sleep percentage",
        "Deep sleep percentage",
        "Light sleep percentage",
    ]

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    @staticmethod
    def _fill_with_mode(column):
        """Return ``column`` with nulls replaced by its most frequent value."""
        if not column.isnull().any():
            return column
        modes = column.mode()
        # An all-null column has no mode; leave it untouched rather than
        # failing on an empty lookup.
        if modes.empty:
            return column
        return column.fillna(modes.iloc[0])

    def transform(self, test_size=0.25, random_state=42):
        """Preprocess the data, split it, and save both splits as CSV files.

        Steps: impute nulls with the column mode, encode binary columns as
        0/1, reduce the time columns to the hour, drop unused columns, split,
        then standardise ``Awakenings`` using statistics from the training
        split only.

        Args:
            test_size: Fraction of rows placed in the test split. The default
                of 0.25 equals what ``train_test_split`` uses when no size is
                given, so the split ratio is unchanged.
            random_state: Seed for the shuffle so repeated runs produce the
                same split. Pass ``None`` for an unseeded split.

        Returns:
            None. The results are written to ``<root_dir>/train.csv`` and
            ``<root_dir>/test.csv``.
        """
        data = pd.read_csv(self.config.data_path)

        # Fill all null values with the most frequent value of their column.
        data = data.apply(self._fill_with_mode)

        # Convert categorical columns to binary 0 and 1. Values missing from a
        # mapping become NaN (pandas ``Series.map`` behaviour).
        data["Gender"] = data["Gender"].map({'Female': 0, 'Male': 1})
        data["Smoking status"] = data["Smoking status"].map({'Yes': 1, 'No': 0})
        for column_name in ("Caffeine consumption", "Alcohol consumption"):
            # Any positive amount counts as 1.
            data[column_name] = data[column_name].apply(lambda x: 1 if x > 0 else 0)

        # Convert the time columns to just the hour.
        for column_name in ("Bedtime", "Wakeup time"):
            data[column_name] = pd.to_datetime(data[column_name]).dt.hour

        data = data.drop(columns=self._DROPPED_COLUMNS)

        logging.info("Splitting data into training and testing sets")
        train, test = train_test_split(
            data, test_size=test_size, random_state=random_state
        )
        # Work on independent copies so the column assignments below do not
        # trigger pandas' chained-assignment warning.
        train, test = train.copy(), test.copy()

        # Fit the scaler on the training split only, then apply the same
        # parameters to the test split, so test statistics never influence
        # the training features.
        # NOTE(review): the fitted scaler is not persisted; inference code
        # would need the same mean/scale -- confirm whether it should be saved.
        scaler = StandardScaler()
        train["Awakenings"] = scaler.fit_transform(train[["Awakenings"]]).ravel()
        test["Awakenings"] = scaler.transform(test[["Awakenings"]]).ravel()

        train.to_csv(os.path.join(self.config.root_dir, "train.csv"), index=False)
        test.to_csv(os.path.join(self.config.root_dir, "test.csv"), index=False)

        logging.info(train.shape)
        logging.info(test.shape)

In [25]:
from src.exception import CustomException
import sys

In [26]:
# Run the data-transformation stage end to end.
try:
    config = ConfigurationManager()
    data_transformation_config = config.get_data_transformation_config()
    data_transformation = DataTransformation(config=data_transformation_config)
    data_transformation.transform()
except Exception as e:
    # Chain explicitly so the original error is reported as the direct cause
    # of the project-specific exception.
    raise CustomException(e, sys) from e