In [5]:
def read_data(path='../data/energydata_complete.csv'):

    ''' Read the original dataset from a csv file into a pandas DataFrame.

    Input:
    path (str, path object or file-like): Location of the csv file; it is
        passed unchanged to pd.read_csv. Defaults to the location that was
        previously hard-coded, so calling read_data() behaves as before.

    Output:
    DataFrame with raw data
    '''

    return pd.read_csv(path)

In [1]:
def prepare_data(df):

    ''' Prepare the data for further analysis - conduct feature engineering and remove irrelevant columns.

    Note: the passed DataFrame is modified in place (columns are added and
    dropped on the same object) and that same object is returned.

    Input:
    df (DataFrame): Dataframe with raw dataset to be modified. Must contain
        the columns 'date', 'rv1' and 'rv2'.

    Output:
    Dataframe ready for further analysis (further EDA / feature selection).
    '''

    # the date is read from csv as object, so convert it to datetime first
    timestamps = pd.to_datetime(df['date'])
    df['date'] = timestamps

    # calendar / clock features derived from the timestamp
    df['month'] = timestamps.dt.month
    df['day'] = timestamps.dt.day
    df['hour'] = timestamps.dt.hour
    df['minute'] = timestamps.dt.minute

    # weekday as an integer, Monday = 0 ... Sunday = 6
    df['day_of_week'] = timestamps.dt.dayofweek

    # time of the day derived from the hour:
    #   1 - morning   (06:00 - 11:59)
    #   2 - afternoon (12:00 - 17:59)
    #   3 - evening / night (every other hour)
    # Each comparison is parenthesized: `&` binds tighter than `>=` / `<`,
    # so without the parentheses the afternoon mask would be true for all rows.
    hour = df['hour']
    is_morning = (hour >= 6) & (hour < 12)
    is_afternoon = (hour >= 12) & (hour < 18)

    df['time_of_day'] = np.select([is_morning, is_afternoon], [1, 2], default=3)

    # exclude irrelevant columns
    df.drop(columns=['date', 'rv1', 'rv2'], inplace=True)

    return df