# **Fetch dataset from Kaggle to Google Colab**

In [1]:
#tell the kaggle client to look for its configuration in /content/
#NOTE(review): presumably the kaggle.json API token has to be uploaded to /content/ beforehand - confirm
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/'

In [2]:
#upgrade kaggle library (forced reinstall of the package itself, dependencies are left as they are)
#NOTE(review): the version is not pinned; the recorded output of this cell shows kaggle-1.5.12 being built
!pip install --upgrade --force-reinstall --no-deps kaggle

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/3a/e7/3bac01547d2ed3d308ac92a0878fbdb0ed0f3d41fb1906c319ccbba1bfbc/kaggle-1.5.12.tar.gz (58kB)
[K     |█████▋                          | 10kB 14.1MB/s eta 0:00:01[K     |███████████▏                    | 20kB 19.9MB/s eta 0:00:01[K     |████████████████▊               | 30kB 16.2MB/s eta 0:00:01[K     |██████████████████████▎         | 40kB 12.9MB/s eta 0:00:01[K     |███████████████████████████▉    | 51kB 14.4MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.5MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-cp37-none-any.whl size=73053 sha256=e38aed3cf25e5341cfe0ad9da0aef242add2ac69b5c1d1f27dd9da323e57b5be
  Stored in directory: /root/.cache/pip/wheels/a1/6a/26/d30b7499ff85a4a4593377a87ecf55f7d08af42f0de9b60303
Successfully built kaggle
Installing collected packa

In [3]:
#download rcaf dataset to colab
!kaggle competitions download -c reducing-commercial-aviation-fatalities

Downloading reducing-commercial-aviation-fatalities.zip to /content
 99% 2.12G/2.13G [00:32<00:00, 71.3MB/s]
100% 2.13G/2.13G [00:33<00:00, 69.3MB/s]


In [4]:
#unzip dataset and remove zip file
!unzip \*.zip  && rm *.zip

Archive:  reducing-commercial-aviation-fatalities.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# **Importing libraries**

In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import joblib
import random
import numpy as np
import copy

from scipy import signal
from scipy.signal import butter, iirnotch, lfilter, sosfilt
from sklearn.preprocessing import StandardScaler


**Load pre-trained model**

In [2]:
#load the trained LightGBM model from Google Drive
#NOTE(review): no drive-mount step is visible in this notebook - the path below presumably needs Drive mounted at /content/drive first
#NOTE(review): joblib.load unpickles the file, which can execute arbitrary code - only load model files from a trusted source
model = joblib.load('/content/drive/MyDrive/applied aic/case studies/rcaf/models/lgbm_model.pkl')

**Load train data**

In [3]:
#load train data (train.csv is read from the current working directory)
train = pd.read_csv('train.csv')

In [4]:
#split the train data into the target labels (y) and the features (X)
#a missing 'event' column now raises a KeyError right here instead of being
#printed and ignored, which used to leave X and y undefined for later cells
y = train['event']
X = train.drop(columns=['event'])

**Load test data**

In [30]:
#load test data (only the first 100000 rows)
#NOTE(review): this rebinds X, replacing the train features assigned to X in the
#"Load train data" section while y still holds the train labels - confirm which X
#is meant to reach pipeline() before function2(df, y) is called
X = pd.read_csv('test.csv', nrows=100000)

# **Feature Engineering**

In [5]:
def reduce_mem_usage(df):
    """Downcast the columns of a dataframe in place to reduce its memory usage.

    Each column is handled according to its dtype:

    - object columns are converted to ``category``;
    - signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max
      (bounds are inclusive, so e.g. -128..127 fits ``int8``);
    - float columns are cast to ``float16`` or ``float32`` when their min and
      max fit that type's range; values are rounded to the target precision;
    - every other dtype (bool, datetime, category, pandas extension dtypes,
      ...) is left untouched.

    A column is never cast to a wider type than it already has, and columns
    whose min/max are NaN (empty or all-NaN columns) are left as they are.

    parameters-
    df: pandas dataframe; it is modified in place

    returns-
    the same dataframe object, after downcasting

    The memory usage before and after, and the relative saving, are printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    #candidate target dtypes per numpy kind code, narrowest first
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32),
        'u': (np.uint8, np.uint16, np.uint32),
        'f': (np.float16, np.float32),
    }

    for col in df.columns:
        col_type = df[col].dtype

        #pandas extension dtypes (category, nullable ints, ...) are not numpy
        #dtypes; min()/max() or astype() may not be meaningful for them
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'O':
            df[col] = df[col].astype('category')
            continue

        candidates = candidates_by_kind.get(col_type.kind)
        if candidates is None:
            #bool, datetime, timedelta, complex, ...: nothing to downcast to
            continue

        limits_of = np.finfo if col_type.kind == 'f' else np.iinfo
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            candidate = np.dtype(candidate)
            #stop as soon as the candidate would not save any memory
            if candidate.itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            #NaN min/max compare False here, so such columns stay unchanged
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    #guard against a zero-byte dataframe, which would divide by zero
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df


def derive_eeg_features(df):
  """
  derive bipolar eeg features from the raw per-electrode eeg columns

  every derived column 'eeg_<a>-<b>' is the difference eeg_<a> - eeg_<b> of two
  raw electrode columns (Longitudinal-Traverse Bipolar method). after deriving
  them, all 20 raw eeg columns are dropped.

  parameters-
  df: pandas dataframe containing the 20 raw eeg_* columns

  returns-
  a new dataframe: the non-eeg columns of df followed by the 20 derived
  columns. the dataframe passed in is not modified.

  raises-
  KeyError if one of the raw eeg columns is missing
  """
  #(first electrode, second electrode); derived channel = first - second
  bipolar_pairs = [
      #Left electrodes traverse (Fp1 to O1)
      ('fp1', 'f7'), ('f7', 't3'), ('t3', 't5'), ('t5', 'o1'),
      ('fp1', 'f3'), ('f3', 'c3'), ('c3', 'p3'), ('p3', 'o1'),
      #Central electrodes traverse (T3 to T4)
      ('t3', 'c3'), ('c3', 'cz'), ('cz', 'c4'), ('c4', 't4'),
      #Right electrodes traverse (FP2 to O2)
      ('fp2', 'f8'), ('f8', 't4'), ('t4', 't6'), ('t6', 'o2'),
      ('fp2', 'f4'), ('f4', 'c4'), ('c4', 'p4'), ('p4', 'o2'),
  ]

  #raw electrode columns removed once the bipolar channels exist
  #(eeg_pz, eeg_fz and eeg_poz are dropped although no derived channel uses them)
  raw_eeg_columns = ['eeg_fp1', 'eeg_f7', 'eeg_f8', 'eeg_t4', 'eeg_t6', 'eeg_t5', 'eeg_t3', 'eeg_fp2', 'eeg_o1', 'eeg_p3',
                     'eeg_pz', 'eeg_f3', 'eeg_fz', 'eeg_f4', 'eeg_c4', 'eeg_p4', 'eeg_poz',
                     'eeg_c3', 'eeg_cz', 'eeg_o2']

  derived = {
      'eeg_{}-{}'.format(first, second): df['eeg_' + first] - df['eeg_' + second]
      for first, second in bipolar_pairs
  }

  #assign() returns a new dataframe, so the caller's df keeps its original columns
  return df.assign(**derived).drop(columns=raw_eeg_columns)

def signal_filter(signal, low=None, high=None, powerline=60, fs=None, order=None):
    """
    references-
    #https://towardsdatascience.com/getting-the-right-beat-e18acd48b8c1
    #https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.sosfilt.html#scipy.signal.sosfilt
    #https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.lfilter.html

    parameters-
    signal: raw signal data (1-d array-like)
    low: cutoff frequency (Hz) of the Butterworth LOW-pass stage; None skips the stage
    high: cutoff frequency (Hz) of the Butterworth HIGH-pass stage; None skips the stage
    powerline: mains frequency (Hz) removed by the notch stage, default 60Hz (USA);
               None skips the stage
    fs: sampling rate (Hz), required
    order: order of the Butterworth filters, required when low or high is given

    returns-
    numpy array with the filtered signal

    raises-
    TypeError if fs is missing, or if order is missing while low or high is given

    description-
    filters raw signal data using scipy, applying up to three stages in this order:
    1. high-pass at `high`, 2. low-pass at `low`, 3. notch at `powerline` (Q=30).

    NOTE(review): stage 1 attenuates everything below `high` and stage 2 everything
    above `low`. when low < high the two pass-bands do not overlap; a conventional
    band-pass keeping [low, high] would high-pass at `low` and low-pass at `high`.
    the cutoffs are deliberately left as they were because changing them changes
    the feature values a previously trained model would receive - confirm against
    the training code before swapping them.
    """
    if fs is None:
        raise TypeError('signal_filter() requires the sampling rate fs')
    if order is None and (low is not None or high is not None):
        raise TypeError('signal_filter() requires the filter order when low or high is given')

    #nyquist frequency, used to normalize every cutoff into (0, 1)
    nyq = 0.5 * fs
    filtered = signal

    #1 high-pass stage (cutoff `high`)
    if high is not None:
        highpass_sos = butter(order, high / nyq, btype='high', analog=False, output='sos')
        filtered = sosfilt(highpass_sos, filtered)

    #2 low-pass stage (cutoff `low`)
    if low is not None:
        lowpass_sos = butter(order, low / nyq, btype='low', analog=False, output='sos')
        filtered = sosfilt(lowpass_sos, filtered)

    #3 notch stage removing the power line frequency
    if powerline is not None:
        notch_b, notch_a = iirnotch(powerline / nyq, 30)
        filtered = lfilter(notch_b, notch_a, filtered)

    #keep the return type an ndarray even when every stage was skipped
    return np.asarray(filtered)

def remove_redundant_columns(df):
  """
  drop the columns that are not used as model features

  always drops ecg, gsr, r, crew, experiment, time and seat; additionally drops
  id when the dataframe has such a column. returns a new dataframe.
  """
  unwanted = ['ecg', 'gsr', 'r', 'crew', 'experiment', 'time', 'seat']

  #test data has id column
  if 'id' in df:
    unwanted.append('id')

  return df.drop(columns=unwanted)

def standardize(df):
  """
  standardize every column with a StandardScaler and return a pandas dataframe

  the result is built from the scaler's raw output without passing column
  names or an index, so it gets pandas' default integer labels.

  NOTE(review): a new scaler is fitted on whatever dataframe is passed in, on
  every call - it does not reuse statistics from any earlier fit. confirm this
  matches how the model's training features were scaled.
  """
  return pd.DataFrame(StandardScaler().fit_transform(df))


In [31]:
def pipeline(X):
  """
  feature engineering pipeline used for train and test data during prediction

  steps, in order: deep copy of X, memory optimization, bipolar eeg features,
  filtered ecg / gsr / respiration signals, removal of unused columns and
  standardization. returns the standardized dataframe; X itself is not changed.
  """
  #work on a deep copy so the caller's dataframe stays untouched
  features = copy.deepcopy(X)

  #memory optimization, then the derived eeg features
  features = derive_eeg_features(reduce_mem_usage(features))

  #(new column, raw column, signal_filter settings) for each physiological signal
  filter_specs = [
      ('filtered_ecg_signal', 'ecg', dict(low=0.5, high=2, fs=1000, order=5)),
      ('filtered_gsr_signal', 'gsr', dict(low=0.01, high=0.18, fs=1000, order=5)),
      ('filtered_respiration_signal', 'r', dict(low=0.01, high=0.16, fs=1500, order=5)),
  ]
  for filtered_name, raw_name, settings in filter_specs:
    features[filtered_name] = signal_filter(features[raw_name], **settings)

  #remove redundant columns, then standardize what is left
  return standardize(remove_redundant_columns(features))

df = pipeline(X)

Memory usage of dataframe is 21.36 MB
Memory usage after optimization is: 5.25 MB
Decreased by 75.4%


# **Prediction**

**Prediction on test data**

In [33]:
#prediction without labels
def function1(df):
  """
  prediction on test data without any labels

  picks one random row of df, asks the global `model` for its class
  probabilities and prints the name of the most probable cognitive state.

  parameters-
  df: dataframe of model features, one row per sample

  returns-
  the predicted class index as an int (0, 1, 2, ...), so that callers can
  keep the result (previously nothing was returned)

  raises-
  ValueError if df has no rows
  """
  state_names = {
      0: 'Baseline or No event',
      1: 'Surprised or Startle',
      2: 'Channelized attention',
  }

  #get random sample from dataframe; [[...]] keeps it a one-row dataframe
  row_position = random.randrange(df.shape[0])
  proba = model.predict_proba(df.iloc[[row_position]])
  final_pred = int(np.argmax(proba, axis=1)[0])

  #any class other than 0, 1 or 2 is reported as diverted attention
  print('Predicted cognitive state: ' + state_names.get(final_pred, 'Diverted attention'))

  return final_pred

pred = function1(df)

Predicted cognitive state: Baseline or No event


**Prediction on labelled data**

In [17]:
#prediction with labels
def function2(df, y):
  """
  prediction on cross validation data with labels available.

  picks one random row position, asks the global `model` for the class
  probabilities of that row of df and prints the predicted cognitive state
  next to the actual state taken from the same position of y.

  parameters-
  df: dataframe of model features, one row per sample
  y: pandas series of labels ('A', 'B', 'C', ...); it is read by position, so
     it has to be row-aligned with df

  returns-
  tuple (predicted class index as int, actual label from y), so that callers
  can keep the result (previously nothing was returned)

  raises-
  ValueError if df has no rows
  """
  predicted_names = {
      0: 'Baseline or No event',
      1: 'Surprised or Startle',
      2: 'Channelized attention',
  }
  actual_names = {
      'A': 'Baseline or No event',
      'B': 'Surprised or Startle',
      'C': 'Channelized attention',
  }

  #get random sample from dataframe; [[...]] keeps it a one-row dataframe
  row_position = random.randrange(df.shape[0])
  proba = model.predict_proba(df.iloc[[row_position]])
  final_pred = int(np.argmax(proba, axis=1)[0])

  #any class other than 0, 1 or 2 is reported as diverted attention
  print('Predicted cognitive state: ' + predicted_names.get(final_pred, 'Diverted attention'))

  #any label other than A, B or C is reported as diverted attention
  ground_truth = y.iloc[row_position]
  print('Actual cognitive state: ' + actual_names.get(ground_truth, 'Diverted attention'))

  return final_pred, ground_truth

pred = function2(df, y)

Predicted cognitive state: Channelized attention
Actual cognitive state: Channelized attention
