In [None]:
import os
import random
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [None]:
def preprocessing_stuff(load_path, save_path, start_row=49643, end_row=252566):
    """Load the raw elevator log and build the per-row feature frame.

    The CSV is read with a three-row header of which only the middle level is
    kept. Rows ``start_row:end_row`` are selected, the textual request states
    are mapped to 0/1, each pair of adjacent request columns is OR-ed into
    one, and the columns are renamed and reordered to
    ``Timestamp, 1..7, Floor, Cycles, Weekday`` plus a derived ``Direction``.

    Parameters
    ----------
    load_path : str
        Path of the raw CSV export (read as ISO-8859-1).
    save_path : str or None
        When not ``None`` the resulting frame is written there as CSV
        (without the index). ``None`` skips writing.
    start_row, end_row : int or None
        Positional row window of the raw file that is kept (half-open, like a
        slice). The defaults reproduce the window that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``Timestamp`` is the time of day as a fraction of 24h, ``Weekday`` is
        ``datetime.weekday() / 6``, ``Direction`` is the sign of the floor
        change relative to the previous row (0 for the first row).
    """
    def parse_stamp(text):
        # The format string has no time-zone field, so drop the suffix first.
        cleaned = text.replace(" CEST", '').strip()
        cleaned = cleaned.replace("CET", '').strip()
        return datetime.strptime(cleaned, '%d-%b-%y %I:%M:%S %p')

    def timestamp(x):
        moment = parse_stamp(x)
        midnight = moment.replace(hour=0, minute=0, second=0, microsecond=0)
        return (moment - midnight).total_seconds() / 86400

    def weekday(x):
        return parse_stamp(x).weekday() / 6

    raw = pd.read_csv(load_path, sep=',', header=[0, 1, 2], encoding='iso-8859-1')
    raw.columns = raw.columns.droplevel(level=[0, 2])

    df1 = raw.iloc[start_row:end_row].copy()
    df1.index = pd.RangeIndex(len(df1.index))

    # Order matters: "Keine Anforderung" must become 0 before the bare
    # "Anforderung" pattern turns the remaining cells into 1.
    df1 = df1.replace(r'\s*Keine Anforderung\s*', 0, regex=True)
    df1 = df1.replace(r'\s*keine Daten verfügbar\s*', 0, regex=True)
    df1 = df1.replace(r'\s*Anforderung\s*', 1, regex=True)
    # NOTE(review): as regexes these match "Na"/"na" followed by any number of
    # "N"/"n", not just the literal "NaN"/"nan" -- confirm that is intended.
    df1 = df1.replace('NaN*', np.nan, regex=True)
    df1 = df1.replace('nan*', np.nan, regex=True)
    # Seed the first row of column 15 (renamed to 'Floor' below) so that the
    # forward fill has a starting value.
    df1.iloc[0, 15] = "1.0"
    df1 = df1.ffill()

    # add day of week
    df1['Weekday'] = df1.iloc[:, 0].apply(weekday)

    df1.iloc[:, 1:14] = df1.iloc[:, 1:14].apply(pd.to_numeric)
    # Columns (1,2), (3,4), ... (13,14) are OR-ed pairwise into the odd
    # column; row 0 is left as it is.
    for keep in range(1, 14, 2):
        df1.iloc[1:, keep] = df1.iloc[1:, keep] | df1.iloc[1:, keep + 1]
    df1 = df1.drop(raw.columns[list(range(2, 15, 2))], axis=1)
    df1.iloc[:, 0] = df1.iloc[:, 0].apply(timestamp)
    df1.columns = ["Timestamp", "3", "4", "5", "6", "7", "2", "1", "Floor", "Cycles", 'Weekday']
    df1 = df1[["Timestamp", "1", "2", "3", "4", "5", "6", "7", "Floor", "Cycles", 'Weekday']]

    # add directions: sign of the floor delta to the previous row, 0 for row 0
    floors = np.array([float(value) for value in df1['Floor']], dtype=float)
    directions = np.zeros(len(floors))
    directions[1:] = np.sign(np.diff(floors))
    df1['Direction'] = directions

    if save_path is not None:
        df1.to_csv(save_path, index=False)

    return df1

In [None]:
# Build the feature frame from the raw elevator export; None is passed as save_path.
df = preprocessing_stuff('./data/TrainingdataElevator.csv', None)

In [None]:
def fill_missing_label_stuff(df):
    """Back-fill request flags from the stops the elevator actually made.

    Walks the rows in order. Whenever the last column (the direction) shows
    the car leaving a standstill or reversing, the floor in column 8 is
    treated as a served request and a 1 is written into the column whose
    *position* equals that floor number, on the row of the previously
    recorded stop. Stops on floor 1.0 are only recorded for one in four
    draws of a seeded random generator.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame laid out positionally as ``Timestamp, 1..7, Floor, ...`` with
        the direction in the last column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the additional request flags set.
    """
    df = df.copy()
    last_stop = 0

    # make it deterministic (note: this reseeds the global `random` state)
    random.seed(43)

    # one-in-four chance that a stop on the parking floor counts as a request
    prop = [0, 0, 0, 1]

    # Read by position so the loop does not depend on the index labels and
    # does not rely on integer keys being positional on a labelled Series.
    directions = df.iloc[:, -1].to_numpy()
    floors = df.iloc[:, 8].to_numpy()

    # The last row has no successor to compare against, so it is skipped.
    for pos in range(len(df) - 1):
        here, ahead = directions[pos], directions[pos + 1]
        leaving_rest = here == 0.0 and here != ahead
        reversing = here != 0.0 and (here + ahead) == 0.0
        if not (leaving_rest or reversing):
            continue

        # float() accepts both padded strings such as " 3.0 " and numbers.
        current_floor = float(floors[pos])

        # 1.0 is the parking spot: only record a request if the draw says so
        if current_floor == 1.0 and random.choice(prop) == 0:
            continue

        df.iloc[last_stop, int(current_floor)] = 1
        last_stop = pos

    return df

In [None]:
# Add request flags inferred from the observed stops; the function works on a copy, so df is left as it was.
df1 = fill_missing_label_stuff(df)

In [None]:
def generate_label_stuff(df, save_path):
    """Derive a per-row target floor from the request flags and save everything.

    Requests seen in columns 1..7 are queued with a time-to-live of five
    stops. Every time the direction (last column) drops to 0 after a
    non-zero value, all queued requests age by one; if the current floor
    (column 8) is queued, every row since the previous labelled stop is
    labelled with that floor. Rows after the last labelled stop get no label.

    Three CSV files are written: the frame itself (with ``Floor`` scaled as
    ``(floor - 1) / 6``) to ``save_path``, and the labels and their one-hot
    encoding next to it with ``_label`` / ``_label_oh`` appended to the file
    stem.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame laid out positionally as ``Timestamp, 1..7, Floor, ...`` with
        the direction in the last column and a column named ``'Floor'``.
        It is not modified.
    save_path : str
        Destination of the feature CSV; the label file names derive from it.

    Returns
    -------
    tuple
        ``(features, labels, labels_onehot)`` where ``features`` is the frame
        truncated to the number of labelled rows, ``labels`` is a one-column
        frame and ``labels_onehot`` has one column per floor 1..7.

    Raises
    ------
    ValueError
        If a label outside 1..7 was produced.
    """
    df4 = df.copy()
    ttl = 5

    # Read by position so the loop neither depends on the index labels nor
    # on integer keys being positional on a labelled Series.
    requests = df4.iloc[:, 1:8].to_numpy()
    floors = df4.iloc[:, 8].to_numpy()
    directions = df4.iloc[:, -1].to_numpy()

    # Each queue row is [floor, remaining ttl]; the initial [0, 0] entry
    # expires on the first stop.
    queue = np.array([[0, 0]])
    label_chunks = []
    last_index = 0

    for pos in range(1, len(df4)):
        # new requests: column position k (1-based within 1..7) means floor k
        for floor_id in np.flatnonzero(requests[pos] > 0) + 1:
            queued = np.flatnonzero(queue[:, 0] == floor_id)
            if queued.size == 0:
                queue = np.append(queue, np.array([[floor_id, ttl]]), axis=0)
            else:
                # already queued: just refresh its ttl
                queue[queued, 1] = ttl

        # only act on the row where the car has just come to a stop
        just_stopped = directions[pos] == 0.0 and directions[pos] != directions[pos - 1]
        if not just_stopped:
            continue

        queue[:, 1] -= 1

        # NOTE(review): only the first `ttl` queue entries are searched, as in
        # the original code -- confirm that this limit is intended.
        match = np.flatnonzero(queue[:ttl, 0] == float(floors[pos]))
        if match.size > 0:
            label_chunks.append(np.full(pos - last_index, queue[match[0], 0], dtype=float))
            last_index = pos

        # drop the served request and everything whose ttl ran out
        queue = np.delete(queue, match, axis=0)
        queue = queue[queue[:, 1] >= 0]

    labels = np.concatenate(label_chunks) if label_chunks else np.array([])
    df_labels = pd.DataFrame(labels)

    # splitext keeps directories, "..", and dots inside the path intact.
    stem, _ = os.path.splitext(save_path)
    df_labels.to_csv(stem + '_label.csv', index=False)

    # normalize floor
    df4['Floor'] = (df4['Floor'].astype(float) - 1.0) / 6.0

    # onehot encoding over the fixed floor set; unknown labels are an error
    categories = np.array([1., 2., 3., 4., 5., 6., 7.])
    unknown = ~np.isin(labels, categories)
    if unknown.any():
        raise ValueError(f'Found unknown categories {np.unique(labels[unknown]).tolist()} in labels')
    df_oh = pd.DataFrame((labels[:, None] == categories).astype(float))

    df_oh.to_csv(stem + '_label_oh.csv', index=False)
    df4.to_csv(save_path, index=False)

    return df4[:df_labels.shape[0]], df_labels, df_oh

In [None]:
# Generate the labels; this also writes the feature CSV to the given path plus the `_label` and `_label_oh` CSV files.
X_train, y_train, y_train_oh = generate_label_stuff(df1, './data/mid_part.csv')