<a href="https://colab.research.google.com/github/SwaksharDeb/Activity-recognition/blob/master/data_preprocessing_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Download and extract the dataset

URL = "https://ieee-dataport.s3.amazonaws.com/open/11167/Training.zip?response-content-disposition=attachment%3B%20filename%3D%22Training.zip%22&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAJOHYI4KJCE6Q7MIQ%2F20200616%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200616T082632Z&X-Amz-SignedHeaders=Host&X-Amz-Expires=86400&X-Amz-Signature=945354b7acb38f2172b8978a73ceffbf5943e91a14442c7314dbb666430fb079" #@param {type : "string"} 
savepath = "Training.zip" #@param {type : 'string'}
extractpath = "/content/dataset/" #@param {type : 'string'}

from zipfile import ZipFile
from urllib.request import urlretrieve
import os

if not os.path.isfile(savepath):
  urlretrieve(URL, savepath)
with ZipFile(savepath, 'r') as zip_file:
  zip_file.extractall(extractpath)

!rm -rf /content/sample_data/

In [None]:
#@title Install Dependencies
from google.colab import files

in_file = files.upload()

if len(in_file.keys()) == 1:
  for fn in in_file.keys():
    requirement = "/content/" + str(fn)

!pip install -r $requirement

<h1> Import Dependencies </h1>

In [None]:
import numpy as np
import pandas as pd
import math
import datetime
import dateutil

In [None]:
#@title Loading the dataset

accel_dataset_file = "/content/training/Lab/bigact_raw_lab_acc.csv" #@param {type : "string"}
label_dataset_file = "/content/training/Lab/labels_lab_2users.csv" #@param {type : "string"}
ignore_seconds = True #@param {type : "boolean"}


def _read_csv_with_utc_dates(csv_path, date_column_positions):
  """Read a CSV file and parse selected columns as UTC datetimes.

  Args:
    csv_path: path of the CSV file to read.
    date_column_positions: zero-based positions of the columns that hold
      datetime strings.

  Returns:
    A DataFrame in which the selected columns are timezone-aware (UTC)
    datetimes; every other column is left exactly as `pd.read_csv` produced
    it with `na_filter=False`.
  """
  frame = pd.read_csv(csv_path, na_filter=False)
  # Convert explicitly rather than through the `date_parser` /
  # `infer_datetime_format` arguments of read_csv, which are deprecated.
  for position in date_column_positions:
    column = frame.columns[position]
    frame[column] = pd.to_datetime(frame[column], utc=True)
  return frame


accel_dataset = _read_csv_with_utc_dates(accel_dataset_file, [1])
label_dataset = _read_csv_with_utc_dates(label_dataset_file, [2, 3])

if ignore_seconds:
  # Round every timestamp down to the start of its minute. The vectorised
  # floor also clears any sub-microsecond remainder.
  accel_dataset['datetime'] = accel_dataset['datetime'].dt.floor('min')
  label_dataset['start'] = label_dataset['start'].dt.floor('min')
  label_dataset['finish'] = label_dataset['finish'].dt.floor('min')

# Data Processing

<h3> Sort the values according to datetime </h3>

In [None]:
# Order the accelerometer rows by their timestamp and the label rows by
# (start, finish); both results get a fresh 0..n-1 index.
accel_dataset = accel_dataset.sort_values(by='datetime', ignore_index=True)
label_dataset = label_dataset.sort_values(by=['start', 'finish'], ignore_index=True)

<h3> Truncate rows of the accelerometer dataset whose datetime falls outside the time span covered by the label dataset </h3>

In [None]:
# Time span covered by the labels: from the earliest start to the latest finish.
# The upper bound must be a `finish` value; bounding by the last row's `start`
# would discard accelerometer rows recorded during the final label interval(s).
label_first_time_entry = label_dataset['start'].min()
label_last_time_entry = label_dataset['finish'].max()

# Keep only accelerometer rows inside that span (both ends inclusive) and
# renumber the surviving rows from 0.
in_label_span = accel_dataset['datetime'].between(label_first_time_entry, label_last_time_entry)
trunc_accel_dataset = accel_dataset.loc[in_label_span].reset_index(drop=True)


<h3> Extract accelerometer data that corresponds to label datetime </h3>

In [None]:
# One chunk (x, y, z, act_id) is collected per label row that matches at least
# one accelerometer row.
final_aligned_dataset = []

# Position of the activity column in the label table (its second column).
ACTIVITY_COLUMN_POSITION = 1

# Hoisted out of the loop: these columns are the same for every label row.
accel_times = trunc_accel_dataset['datetime']
accel_users = trunc_accel_dataset['user_id']

# Iterating over row tuples avoids using index labels as row positions, so the
# loop does not depend on the label table having a 0..n-1 index.
for label_row in label_dataset.itertuples(index=False):
  activity = label_row[ACTIVITY_COLUMN_POSITION]

  # Rows of the same user whose timestamp lies in [start, finish], inclusive.
  in_window = (accel_times.between(label_row.start, label_row.finish)
               & (accel_users == label_row.user_id))

  chunk = trunc_accel_dataset.loc[in_window, ['x', 'y', 'z']].reset_index(drop=True)
  if chunk.empty:
    continue

  # Attach the activity of this label row to every matching reading.
  final_aligned_dataset.append(chunk.assign(act_id=activity))

<h3> Generate aligned dataframe </h3>

In [None]:
# Stack the per-label chunks into a single frame. ignore_index=True renumbers
# the rows 0..n-1; without it every chunk keeps its own 0-based index, the
# result carries duplicate index labels, and labels no longer match row positions.
final_aligned_dataset = pd.concat(final_aligned_dataset, ignore_index=True)
final_aligned_dataset.columns = ['x', 'y', 'z', 'act_id']

In [None]:
# Show how often each distinct x reading occurs in the aligned data.
final_aligned_dataset.loc[:, 'x'].value_counts()

 0.191    18321
 0.076        9
 0.114        8
-2.030        7
 0.344        7
-0.344        6
-0.498        6
 0.268        6
-1.992        5
-0.383        5
 0.153        4
-0.306        4
-1.953        4
 0.459        4
-2.106        3
 0.421        3
-1.915        3
 0.383        3
 0.689        2
-2.604        2
-2.068        2
-0.421        2
-2.145        2
-1.723        2
-0.076        2
 0.842        2
-2.183        2
-0.038        2
-5.899        2
-0.842        1
-1.379        1
-5.056        1
 0.038        1
-6.320        1
-3.830        1
 0.498        1
Name: x, dtype: int64

<h3> Remove ambiguity: drop every accelerometer reading (x, y, z) that occurs more than once </h3>

In [None]:
# A reading is ambiguous when the same (x, y, z) triple occurs more than once;
# all occurrences of such a triple are dropped.
ambigous_accel = final_aligned_dataset.loc[:, ['x', 'y', 'z']]

# Row *positions* of the unambiguous readings. Index labels must not be used
# here: `.iloc` is positional, and labels equal positions only when the frame
# has a unique 0..n-1 index.
non_ambi_index = np.flatnonzero(~ambigous_accel.duplicated(keep=False).to_numpy())

final_aligned_dataset = final_aligned_dataset.iloc[non_ambi_index]

In [None]:
#@title Save aligned data as csv

savepath = "/content/processed" #@param {type : 'string'}
savename = "lab_corrected.csv" #@param {type : 'string'}

import os

# Create the output folder if needed; an already existing folder is fine.
os.makedirs(savepath, exist_ok=True)

# Write the aligned table without its index column.
complete_savename = "/".join([savepath, savename])
final_aligned_dataset.to_csv(complete_savename, index=False)