In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import pywt
from statsmodels.robust import mad
import json, os
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
import numpy as np

In [None]:
def norm_data_f(data, colv):
    """Rescale one column of ``data`` with a default ``MinMaxScaler``.

    Args:
        data: frame holding the column; it is modified in place.
        colv: label of the column to rescale.

    Returns:
        The same ``data`` object, with ``colv`` replaced by the scaler output.
    """
    scaler = preprocessing.MinMaxScaler()
    # The scaler is fitted on a single-feature 2-D array, so the column is
    # cast to float and reshaped to (n_rows, 1) first.
    column_matrix = data[colv].astype(float).values.reshape(-1, 1)
    data[colv] = scaler.fit_transform(column_matrix)
    return data

def get_ori_data(raw_data_path):
    """Load a raw CSV, keep the known columns and list its distinct key values.

    Args:
        raw_data_path: path of the CSV file to read. It must contain every
            column named in ``cols`` below; extra columns are discarded.

    Returns:
        A five-element list
        ``[data_df_sorted, hour_values, minute_values, device_values, month_values]``
        where ``data_df_sorted`` is the frame ordered by ``device_id`` then
        ``month`` and the other four entries are the arrays returned by
        ``Series.unique()`` for the matching column of that sorted frame.

    Raises:
        KeyError: if any expected column is missing from the file.
    """
    cols = ['date', 'month', 'hour', 'minute', 'dow', 'season', 'device_id',
            'device_speed', 'k_index', 'volume', 'device_distance',
            'INRIX_speed']
    data_df = pd.read_csv(raw_data_path)[cols]
    data_df_sorted = data_df.sort_values(["device_id", "month"])
    # ``unique()`` already collapses repeats in first-seen order, so a separate
    # drop_duplicates pass over the frame would not change any of these arrays.
    hour_values = data_df_sorted["hour"].unique()
    minute_values = data_df_sorted["minute"].unique()
    device_values = data_df_sorted["device_id"].unique()
    month_values = data_df_sorted["month"].unique()
    return [data_df_sorted, hour_values, minute_values, device_values, month_values]

def get_ori_test_data(raw_data_path):
    """Load a raw test CSV; same contract as ``get_ori_data``.

    The test loader used to be a line-for-line copy of ``get_ori_data``.
    Delegating keeps the two from drifting apart.

    Args:
        raw_data_path: path of the CSV file to read.

    Returns:
        The five-element list
        ``[data_df_sorted, hour_values, minute_values, device_values, month_values]``
        produced by ``get_ori_data`` for ``raw_data_path``.
    """
    return get_ori_data(raw_data_path)

def get_test_data(values):
    """Split a 2-D array into model inputs and targets.

    Args:
        values: 2-D array whose last column is the target and whose other
            columns are the features.

    Returns:
        ``[test_X, test_y]`` where ``test_X`` has shape
        ``(n_rows, 1, n_features)`` and ``test_y`` is the last column.
    """
    # The 3/3 fraction keeps every row; the whole array is used as test data.
    row_count = int(len(values) * (3 / 3))
    held_out = values[:row_count, :]
    features = held_out[:, :-1]
    targets = held_out[:, -1]
    n_rows, n_features = features.shape
    # Insert a length-1 middle axis between the row and feature axes.
    return [features.reshape((n_rows, 1, n_features)), targets]

def waveletSmooth(merged_data, smooth_columns):
    """Denoise the given columns of ``merged_data`` by wavelet thresholding.

    Each column is decomposed with the 'db1' wavelet, every coefficient array
    except the first is hard-thresholded, and the column is overwritten with
    the reconstruction.

    Args:
        merged_data: frame holding the columns; it is modified in place.
        smooth_columns: iterable of column labels to smooth.

    Returns:
        The same ``merged_data`` object with the listed columns replaced.
    """
    wavelet = pywt.Wavelet('db1')
    n_rows = merged_data.shape[0]
    for col_name in smooth_columns:
        coeffs = pywt.wavedec(merged_data[col_name], wavelet)
        # The threshold is the MAD of the fifth-from-last coefficient array.
        # NOTE(review): this indexing raises IndexError when the decomposition
        # yields fewer than five arrays (very short series) - confirm inputs.
        threshold = mad(coeffs[-5])
        coeffs[1:] = [
            pywt.threshold(detail, value=threshold, mode="hard")
            for detail in coeffs[1:]
        ]
        smoothed = pywt.waverec(coeffs, wavelet)
        # The reconstruction can come back one sample longer than the input;
        # drop the extra trailing sample so it fits the frame.
        if len(smoothed) > n_rows:
            smoothed = smoothed[:-1]
        merged_data[col_name] = smoothed
    return merged_data

In [None]:
def _train_slot_stats(frame, device_values):
    """Summarise ``INRIX_speed`` per (device_id, dow, month, hour, minute) slot.

    Args:
        frame: raw data frame containing the five key columns and
            ``INRIX_speed``.
        device_values: device ids to process, in reporting order.

    Returns:
        A frame with integer key columns plus ``meadianv``, ``mean``, ``min``
        and ``max`` (truncated to int) and ``stdv`` (float, population
        standard deviation as computed by ``np.std``).
    """
    key_cols = ['device_id', 'dow', 'month', 'hour', 'minute']
    stat_cols = ['meadianv', 'mean', 'stdv', 'min', 'max']
    # Only day-of-week codes 0..6 are summarised; other codes get no statistics.
    usable = frame[frame['dow'].isin(list(range(7)))]
    records = []
    for position, device in enumerate(device_values, start=1):
        print ('Training Data for ' + str(position) + ' of ' + str(len(device_values)) + ' ids Generated')
        device_rows = usable[usable['device_id'] == device]
        # One grouped pass per device replaces re-filtering the whole frame for
        # every dow/month/hour/minute combination.
        grouped = device_rows.groupby(key_cols[1:], sort=False)['INRIX_speed']
        for (dow, month, hour, minute), speeds in grouped:
            speed_values = speeds.values
            records.append([int(device), int(dow), int(month), int(hour), int(minute),
                            int(np.median(speed_values)), int(np.mean(speed_values)),
                            float(np.std(speed_values)), int(np.min(speed_values)),
                            int(np.max(speed_values))])
    stats = pd.DataFrame(records, columns=key_cols + stat_cols)
    # Keep the merge keys integer-typed even when no slot produced a record.
    stats[key_cols] = stats[key_cols].astype(int)
    return stats


def generate_training_data(smooth, shift):
    """Build one training CSV per raw file in ``datasets/raw_data/train/``.

    For every raw CSV the per-slot speed statistics are computed, merged back
    onto the raw rows, optionally wavelet-smoothed, and extended with a
    ``speed_2`` column holding ``INRIX_speed`` displaced by ``shift`` rows.
    Rows with a null ``speed_2`` or ``k_index`` are dropped and the result is
    written to ``datasets/training_datasets/<raw name>_<shift>.csv``.

    Directory entries that are not regular ``.csv`` files (for example a
    ``.ipynb_checkpoints`` folder) are skipped, and the output folder is
    created when missing.

    Args:
        smooth: truthy to smooth ``INRIX_speed``, ``device_speed``, ``volume``
            and ``meadianv`` with ``waveletSmooth`` before shifting.
        shift: number of rows passed to ``Series.shift`` to build ``speed_2``.

    Returns:
        None. The output is the CSV files written to disk.
    """
    # example:
    # generate_training_data(smooth=1, shift=3)
    train_folder = 'datasets/raw_data/train/'
    out_folder = 'datasets/training_datasets/'
    key_cols = ['device_id', 'dow', 'month', 'hour', 'minute']
    smooth_columns = ['INRIX_speed', 'device_speed', 'volume', 'meadianv']
    os.makedirs(out_folder, exist_ok=True)

    for cur_file in sorted(os.listdir(train_folder)):
        raw_train_data_path = train_folder + cur_file
        if not (os.path.isfile(raw_train_data_path) and cur_file.lower().endswith('.csv')):
            continue
        filename = os.path.basename(cur_file).split('.csv')[0] + '_' + str(shift)
        out_path = out_folder + filename + '.csv'
        if os.path.isfile(out_path):
            os.remove(out_path)
            print ('existing file removed!')

        data_df_sorted, _, _, device_values, _ = get_ori_data(raw_train_data_path)
        # Statistics stay in memory; no scratch CSV is written and re-read.
        training_data = _train_slot_stats(data_df_sorted, device_values)
        merged_data = pd.merge(data_df_sorted, training_data, how='left', on=key_cols)

        if smooth:
            merged_data = waveletSmooth(merged_data, smooth_columns)
        # NOTE(review): the shift runs over the whole frame, so the first rows
        # of each device take their value from the preceding device - confirm
        # this is intended.
        merged_data['speed_2'] = merged_data['INRIX_speed'].shift(shift)
        merged_data = merged_data[pd.notnull(merged_data['speed_2'])]
        merged_data = merged_data[pd.notnull(merged_data['k_index'])]
        merged_data.to_csv(out_path, index=False)
        
    
#     print ('Training Data Completed .... ')
#     file_path = 'datasets/training_datasets/' + filename + '.csv'
#     return file_path

In [None]:
def _test_slot_stats(frame, device_values):
    """Summarise ``INRIX_speed`` per (device_id, dow, month, hour, minute) slot.

    Args:
        frame: raw test data frame containing the five key columns and
            ``INRIX_speed``.
        device_values: device ids to process, in reporting order.

    Returns:
        A frame with integer key columns plus ``meadianv``, ``mean``, ``min``
        and ``max`` (truncated to int) and ``stdv`` (float, population
        standard deviation as computed by ``np.std``).
    """
    key_cols = ['device_id', 'dow', 'month', 'hour', 'minute']
    stat_cols = ['meadianv', 'mean', 'stdv', 'min', 'max']
    # Only day-of-week codes 0..6 are summarised; other codes get no statistics.
    usable = frame[frame['dow'].isin(list(range(7)))]
    records = []
    for position, device in enumerate(device_values, start=1):
        print ('Test Data for ' + str(position) + ' of ' + str(len(device_values)) + ' ids Generated ..')
        device_rows = usable[usable['device_id'] == device]
        # One grouped pass per device replaces re-filtering the whole frame for
        # every dow/month/hour/minute combination.
        grouped = device_rows.groupby(key_cols[1:], sort=False)['INRIX_speed']
        for (dow, month, hour, minute), speeds in grouped:
            speed_values = speeds.values
            # stdv is kept as a float so the test features match the training
            # features, which store the untruncated standard deviation.
            records.append([int(device), int(dow), int(month), int(hour), int(minute),
                            int(np.median(speed_values)), int(np.mean(speed_values)),
                            float(np.std(speed_values)), int(np.min(speed_values)),
                            int(np.max(speed_values))])
    stats = pd.DataFrame(records, columns=key_cols + stat_cols)
    # Keep the merge keys integer-typed even when no slot produced a record.
    stats[key_cols] = stats[key_cols].astype(int)
    return stats


def generate_test_data(smooth, shift, new_cols):
    """Build one test CSV per raw file in ``datasets/raw_data/test/``.

    For every raw CSV the per-slot speed statistics are computed, merged back
    onto the raw rows, optionally wavelet-smoothed, and extended with a
    ``speed_2`` column holding ``INRIX_speed`` displaced by ``shift`` rows.
    Rows with a null ``speed_2`` or ``k_index`` are dropped and the result is
    written to ``datasets/test_datasets/<raw name>_<shift>.csv``.

    Directory entries that are not regular ``.csv`` files (for example a
    ``.ipynb_checkpoints`` folder) are skipped, and the output folder is
    created when missing.

    Args:
        smooth: truthy to smooth ``INRIX_speed``, ``device_speed``, ``volume``
            and ``meadianv`` with ``waveletSmooth`` before shifting.
        shift: number of rows passed to ``Series.shift`` to build ``speed_2``.
        new_cols: accepted for backward compatibility; not used by this
            function.

    Returns:
        None. The output is the CSV files written to disk.
    """
    test_folder = 'datasets/raw_data/test/'
    out_folder = 'datasets/test_datasets/'
    key_cols = ['device_id', 'dow', 'month', 'hour', 'minute']
    smooth_columns = ['INRIX_speed', 'device_speed', 'volume', 'meadianv']
    os.makedirs(out_folder, exist_ok=True)

    for cur_file in sorted(os.listdir(test_folder)):
        raw_test_data_path = test_folder + cur_file
        if not (os.path.isfile(raw_test_data_path) and cur_file.lower().endswith('.csv')):
            continue
        filename = os.path.basename(cur_file).split('.csv')[0] + '_' + str(shift)
        out_path = out_folder + filename + '.csv'
        if os.path.isfile(out_path):
            os.remove(out_path)
            print ('existing file removed!')

        test_data_df_sorted, _, _, device_values, _ = get_ori_test_data(raw_test_data_path)
        # Statistics stay in memory; no scratch CSV is written and re-read.
        testing_data = _test_slot_stats(test_data_df_sorted, device_values)
        merged_data = pd.merge(test_data_df_sorted, testing_data, how='left', on=key_cols)

        if smooth:
            print ('smoothing done')
            merged_data = waveletSmooth(merged_data, smooth_columns)
        # NOTE(review): the shift runs over the whole frame, so the first rows
        # of each device take their value from the preceding device - confirm
        # this is intended.
        merged_data['speed_2'] = merged_data['INRIX_speed'].shift(shift)
        merged_data = merged_data[pd.notnull(merged_data['speed_2'])]
        merged_data = merged_data[pd.notnull(merged_data['k_index'])]
        merged_data.to_csv(out_path, index=False)
        
#     file_path = 'datasets/test_datasets/' + filename + '.csv'
#     print ('Generating Testing Parameters  Completed .... ')
#     return file_path

In [None]:
# 1 turns wavelet smoothing on, 0 turns it off.
smooth = 1
# Row offsets handed to generate_training_data, one output set per offset.
# NOTE(review): the original note here read "15 minutes" - presumably the
# horizon of one of these offsets; confirm against the data's sampling interval.
shifts = [1, 3, 6, 12]
# shifts = [1]
for shift in shifts:
    generate_training_data(smooth, shift)

In [None]:
# # raw_test_data_path = 'datasets/raw_data/Freeway12m_2months.csv'
# Column list handed to generate_test_data; `smooth` and `shifts` come from
# the training cell above, which must have been run first.
new_cols = [
    'month', 'hour', 'dow', 'device_id',
    'device_speed', 'meadianv', 'speed_2', 'INRIX_speed',
]
for shift in shifts:
    generate_test_data(smooth, shift, new_cols)