In [1]:
# import the libraries
import pandas as pd
import numpy as np
from scipy.io.arff import loadarff
from matplotlib import pyplot as plt

### Preprocess EEG Eye State Dataset

In [2]:
# EEG Eye State dataset: 14980 samples, 14 numeric attributes, and a binary
# label in the last column.

# Load the data. loadarff reads most ARFF files, including files with missing
# values, which it represents as NaNs -- worth knowing for data preprocessing.
# The file used here has no missing values.
# The call returns the records (accessible by attribute name) and the metadata.
EEG_Eye_State, meta = loadarff('EEG-Eye-State.arff')

In [3]:
# meta holds the ARFF header information: the dataset name plus the name and
# type of every attribute, as displayed below.
meta

Dataset: EEG_DATA
	AF3's type is numeric
	F7's type is numeric
	F3's type is numeric
	FC5's type is numeric
	T7's type is numeric
	P7's type is numeric
	O1's type is numeric
	O2's type is numeric
	P8's type is numeric
	T8's type is numeric
	FC6's type is numeric
	F4's type is numeric
	F8's type is numeric
	AF4's type is numeric
	eyeDetection's type is nominal, range is ('0', '1')

In [7]:
# EEG_Eye_State holds the ARFF records, accessible by attribute name. Values
# taken from it can come back as numpy.bytes_ (the nominal label does), so each
# column is converted to a numeric dtype before the matrix is manipulated.

# Derive the column layout from the metadata instead of hard-coding the number
# of samples/attributes: the last attribute is the label, the rest are features.
attribute_names = meta.names()
feature_names, label_name = attribute_names[:-1], attribute_names[-1]

# Parse every feature column as float and the label column as int.
numeric_columns = [
    np.asarray(EEG_Eye_State[name]).astype(float) for name in feature_names
]
numeric_columns.append(np.asarray(EEG_Eye_State[label_name]).astype(int))

# Stack all columns once (rather than growing the matrix column by column).
# NOTE: a NumPy matrix has a single dtype, so the int label is upcast to float
# here (it shows as 0.0 / 1.0); cast it back to int when extracting the labels.
Eye_State_data = np.column_stack(numeric_columns)

# Convert to pandas DataFrame for easier manipulation
df = pd.DataFrame(data=Eye_State_data, columns=attribute_names)

# First 10 samples
df.head(10)

Unnamed: 0,AF3,F7,F3,FC5,T7,P7,O1,O2,P8,T8,FC6,F4,F8,AF4,eyeDetection
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0.0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0.0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0.0
3,4328.72,4011.79,4296.41,4155.9,4343.59,4582.56,4097.44,4630.77,4217.44,4235.38,4210.77,4287.69,4632.31,4396.41,0.0
4,4326.15,4011.79,4292.31,4151.28,4347.69,4586.67,4095.9,4627.69,4210.77,4244.1,4212.82,4288.21,4632.82,4398.46,0.0
5,4321.03,4004.62,4284.1,4153.33,4345.64,4587.18,4093.33,4616.92,4202.56,4232.82,4209.74,4281.03,4628.21,4389.74,0.0
6,4319.49,4001.03,4280.51,4151.79,4343.59,4584.62,4089.74,4615.9,4212.31,4226.67,4201.03,4269.74,4625.13,4378.46,0.0
7,4325.64,4006.67,4278.46,4143.08,4344.1,4583.08,4087.18,4614.87,4205.64,4230.26,4195.9,4266.67,4622.05,4380.51,0.0
8,4326.15,4010.77,4276.41,4139.49,4345.13,4584.1,4091.28,4608.21,4187.69,4229.74,4202.05,4273.85,4627.18,4389.74,0.0
9,4326.15,4011.28,4276.92,4142.05,4344.1,4582.56,4092.82,4608.72,4194.36,4228.72,4212.82,4277.95,4637.44,4393.33,0.0


In [8]:
# Split the DataFrame's underlying array into features and labels.
EEG_Matrix = df.values

# Every column except the last is a feature; keep them as floats.
EEG_Data_Matrix = EEG_Matrix[:, :-1].astype('float')

# The last column is the label; store it as integers.
EEG_Data_Labels = EEG_Matrix[:, -1].astype('int')

# Report the resulting dimensions.
print("Dim(EEG_Data_Matrix) = ", EEG_Data_Matrix.shape)
print("Dim(EEG_Data_Labels) = ", EEG_Data_Labels.shape)

Dim(EEG_Data_Matrix) =  (14980, 14)
Dim(EEG_Data_Labels) =  (14980,)
