# Objective : Data Cleaning (sleepDay)

### Importing Required Libraries ->

In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

### Load the Dataset ->

In [3]:
df = pd.read_csv("G:\My Drive\Prepinsta - Data Analytics\Week 8 - Task 8\Other\sleepDay_merged.csv",index_col=0)

### Preview the DataSet ->

In [4]:
df.head()

Unnamed: 0_level_0,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1503960366,4/12/2016 12:00:00 AM,1,327,346
1503960366,4/13/2016 12:00:00 AM,2,384,407
1503960366,4/15/2016 12:00:00 AM,1,412,442
1503960366,4/16/2016 12:00:00 AM,2,340,367
1503960366,4/17/2016 12:00:00 AM,1,700,712


In [5]:
# Check the shape of dataset
df.shape

(413, 4)

In [6]:
# check if is there any null values in the dataset
df.isnull().sum()

SleepDay              0
TotalSleepRecords     0
TotalMinutesAsleep    0
TotalTimeInBed        0
dtype: int64

In [7]:
# check the datatype of all the colimns in dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 413 entries, 1503960366 to 8792009665
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   SleepDay            413 non-null    object
 1   TotalSleepRecords   413 non-null    int64 
 2   TotalMinutesAsleep  413 non-null    int64 
 3   TotalTimeInBed      413 non-null    int64 
dtypes: int64(3), object(1)
memory usage: 16.1+ KB


### Splitting of SleepDay column in Date and Time Respectively ->

In [11]:
# Define a function to split the ActivityHour column into ActivityDay and ActivityTime
def split_activity_hour(df, column_name='SleepDay'):
    # Split the column into two parts: Date and Time
    split_columns = df[column_name].str.split(' ', expand=True)
    # Assign the split columns to the dataframe
    df['ActivityDay'] = pd.to_datetime(split_columns[0])  # Convert to datetime format
    df['ActivityTime'] = split_columns[1]
    # Drop the original ActivityHour column
    df.drop(column_name, axis=1, inplace=True)
    return df

In [12]:
# Apply the function to each dataset
df = split_activity_hour(df)
df.head()

Unnamed: 0_level_0,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed,ActivityDay,ActivityTime
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1503960366,1,327,346,2016-04-12,12:00:00
1503960366,2,384,407,2016-04-13,12:00:00
1503960366,1,412,442,2016-04-15,12:00:00
1503960366,2,340,367,2016-04-16,12:00:00
1503960366,1,700,712,2016-04-17,12:00:00


### Standerdize the column names ->

In [13]:
# Standardize column names
df.columns = df.columns.str.lower().str.replace(r'([a-z])([A-Z])', r'\1_\2').str.replace(r'[^a-z0-9_]', '', regex=True)

# Display the first few rows of the dataframe to verify the new column names
df.head()

Unnamed: 0_level_0,totalsleeprecords,totalminutesasleep,totaltimeinbed,activityday,activitytime
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1503960366,1,327,346,2016-04-12,12:00:00
1503960366,2,384,407,2016-04-13,12:00:00
1503960366,1,412,442,2016-04-15,12:00:00
1503960366,2,340,367,2016-04-16,12:00:00
1503960366,1,700,712,2016-04-17,12:00:00


### Rearrange the position of 'activityday' and 'activitytime' column ->

In [14]:
# Rearrange the position of 'activityday' column to be adjacent to 'id' column
cols = list(df.columns)
cols.insert(0, cols.pop(cols.index('activityday')))  # Move 'activityday' and 'acivitytime' next to 'id'
cols.insert(1, cols.pop(cols.index('activitytime'))) 
df = df[cols]

# Display the first few rows of the dataframe to verify the rearrangement
df.head()

Unnamed: 0_level_0,activityday,activitytime,totalsleeprecords,totalminutesasleep,totaltimeinbed
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1503960366,2016-04-12,12:00:00,1,327,346
1503960366,2016-04-13,12:00:00,2,384,407
1503960366,2016-04-15,12:00:00,1,412,442
1503960366,2016-04-16,12:00:00,2,340,367
1503960366,2016-04-17,12:00:00,1,700,712


### Formattin the activityday column as datetime datatype ->

In [15]:
# Convert 'activityday' and to datetime format
df['activityday'] = pd.to_datetime(df['activityday'], format='%d-%m-%Y')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 413 entries, 1503960366 to 8792009665
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   activityday         413 non-null    datetime64[ns]
 1   activitytime        413 non-null    object        
 2   totalsleeprecords   413 non-null    int64         
 3   totalminutesasleep  413 non-null    int64         
 4   totaltimeinbed      413 non-null    int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 19.4+ KB


### Remove Duplicate values if any ->

In [16]:
# chek if is there any duplicate values in dataset
duplicates = duplicates = df.duplicated().sum()
duplicates

4

In [17]:
# Remove duplicate rows, keeping the first occurrence of each duplicate
df = df.drop_duplicates()

### Export the Dataset ->

In [18]:
df.to_csv("G:\My Drive\Prepinsta - Data Analytics\Week 8 - Task 8\Master Datasets\sleepDay_data.csv")