## Import the libraries

In [1]:
# import the libraries
import pandas as pd
import numpy as np
from scipy.io.arff import loadarff
from matplotlib import pyplot as plt

## Preprocess the Data - EEG Eye State data

Load the EEG Eye State data

In [2]:
# Load the EEG Eye State data set. `loadarff` reads most ARFF files and can also
# read files with missing data, representing those data points as NaNs, which
# matters for data preprocessing. The data used here has no missing values.
# NOTE: `meta` is rebound when the Autism data is loaded further down, so the
# EEG cells that read `meta` must run before that cell (or after re-running this one).
EEG_Eye_State, meta = loadarff('EEGEyeState.arff')

# 14 input attributes (plus the eyeDetection label)
# 14980 samples

In [3]:
# `meta` holds the header information of the ARFF file; displaying it lists
# every attribute together with its type
meta

Dataset: EEG_DATA
	AF3's type is numeric
	F7's type is numeric
	F3's type is numeric
	FC5's type is numeric
	T7's type is numeric
	P7's type is numeric
	O1's type is numeric
	O2's type is numeric
	P8's type is numeric
	T8's type is numeric
	FC6's type is numeric
	F4's type is numeric
	F8's type is numeric
	AF4's type is numeric
	eyeDetection's type is nominal, range is ('0', '1')

In [4]:
# EEG_Eye_State holds the records of the ARFF file and is indexed by attribute
# name. Every column is cast explicitly (inputs -> float, label -> int) so the
# resulting matrices are plain numeric arrays that can be manipulated without
# dtype errors (nominal values are stored as numpy.bytes_).
attribute_names = meta.names()

# All attributes except the last one are inputs. Stack them side by side in a
# single call instead of growing the matrix one column at a time.
Eye_State_data = np.c_[tuple(
    np.asarray(EEG_Eye_State[name]).astype(float, copy = True)
    for name in attribute_names[:-1]
)]

# Label vector (0 or 1), taken from the last attribute and shaped as a column.
# The row count is inferred (-1) rather than hard-coded, so the cell keeps
# working if the file holds a different number of samples.
Eye_State_label = np.asarray(EEG_Eye_State[attribute_names[-1]]).astype(int, copy = True).reshape(-1, 1)

In [5]:
# Show the first three samples (rows) of the EEG input matrix
print(Eye_State_data[:3])

[[4329.23 4009.23 4289.23 4148.21 4350.26 4586.15 4096.92 4641.03 4222.05
  4238.46 4211.28 4280.51 4635.9  4393.85]
 [4324.62 4004.62 4293.85 4148.72 4342.05 4586.67 4097.44 4638.97 4210.77
  4226.67 4207.69 4279.49 4632.82 4384.1 ]
 [4327.69 4006.67 4295.38 4156.41 4336.92 4583.59 4096.92 4630.26 4207.69
  4222.05 4206.67 4282.05 4628.72 4389.23]]


## Preprocess the Data - Autism Adult data

Load the Autism Adult data

In [6]:
# Load the Autism Adult data set. `loadarff` reads most ARFF files and can also
# read files with missing data, representing those data points as NaNs, which
# matters for data preprocessing. The data used here has some missing values,
# so those 'missing data samples' need to be found.
# NOTE: this rebinds `meta`, which until now described the EEG file; from here
# on `meta` refers to the Autism attributes.
Autism_Adult, meta = loadarff('AutismAdultData.arff')

# 21 attributes (including the Class/ASD label)
# 704 samples

In [7]:
# `meta` now describes the Autism ARFF file; displaying it lists every
# attribute together with its type and, for nominal attributes, its value range
meta

Dataset: adult-weka.filters.unsupervised.attribute.NumericToNominal-Rfirst-10
	A1_Score's type is nominal, range is ('0', '1')
	A2_Score's type is nominal, range is ('0', '1')
	A3_Score's type is nominal, range is ('0', '1')
	A4_Score's type is nominal, range is ('0', '1')
	A5_Score's type is nominal, range is ('0', '1')
	A6_Score's type is nominal, range is ('0', '1')
	A7_Score's type is nominal, range is ('0', '1')
	A8_Score's type is nominal, range is ('0', '1')
	A9_Score's type is nominal, range is ('0', '1')
	A10_Score's type is nominal, range is ('0', '1')
	age's type is numeric
	gender's type is nominal, range is ('f', 'm')
	ethnicity's type is nominal, range is ('White-European', 'Latino', 'Others', 'Black', 'Asian', "'Middle Eastern '", 'Pasifika', "'South Asian'", 'Hispanic', 'Turkish', 'others')
	jundice's type is nominal, range is ('no', 'yes')
	austim's type is nominal, range is ('no', 'yes')
	contry_of_res's type is nominal, range is ("'United States'", 'Brazil', 'Spain',

In [8]:
# Autism_Adult holds the records of the ARFF file and is indexed by attribute
# name. Nominal values arrive as numpy.bytes_, so every column is cast
# explicitly before the columns are assembled into one matrix.
attribute_names = meta.names()

# Positions of the columns that hold integers: the ten A*_Score columns and
# 'age' (positions 0-10) plus 'result' (position 17). All other columns are text.
# NOTE(review): astype(int) on 'age' would turn a NaN (how loadarff marks a
# missing numeric value) into an arbitrary integer - TODO confirm that 'age'
# has no missing entries.
int_positions = set(range(11)) | {17}

columns = []
for position, name in enumerate(attribute_names):
    column = np.asarray(Autism_Adult[name])
    if position in int_positions:
        # go through int first so the values are written as whole numbers
        column = column.astype(int, copy = True)
    # the assembled matrix mixes numbers and text, so every column is stored as
    # str explicitly instead of relying on implicit int/str promotion
    columns.append(column.astype(str, copy = True))

# Stack all columns in one call; the row count comes from the data itself
# rather than from a hard-coded 704
Autism_Adult_data = np.c_[tuple(columns)]

# Convert to a DataFrame so the data is easier to manipulate
Autism_frame = pd.DataFrame(data = Autism_Adult_data, columns = attribute_names)

# Replace the '?' placeholder with NaN, which helps to find the columns with
# missing values. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0; the result is reassigned instead of mutating the frame in place.
Autism_frame = Autism_frame.replace('?', np.nan)

# show the first 10 rows of data frame
Autism_frame.head(10)

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,...,gender,ethnicity,jundice,austim,contry_of_res,used_app_before,result,age_desc,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,...,f,White-European,no,no,'United States',no,6,'18 and more',Self,NO
1,1,1,0,1,0,0,0,1,0,1,...,m,Latino,no,yes,Brazil,no,5,'18 and more',Self,NO
2,1,1,0,1,1,0,1,1,1,1,...,m,Latino,yes,yes,Spain,no,8,'18 and more',Parent,YES
3,1,1,0,1,0,0,1,1,0,1,...,f,White-European,no,yes,'United States',no,6,'18 and more',Self,NO
4,1,0,0,0,0,0,0,1,0,0,...,f,,no,no,Egypt,no,2,'18 and more',,NO
5,1,1,1,1,1,0,1,1,1,1,...,m,Others,yes,no,'United States',no,9,'18 and more',Self,YES
6,0,1,0,0,0,0,0,1,0,0,...,f,Black,no,no,'United States',no,2,'18 and more',Self,NO
7,1,1,1,1,0,0,0,0,1,0,...,m,White-European,no,no,'New Zealand',no,5,'18 and more',Parent,NO
8,1,1,0,0,1,0,0,1,1,1,...,m,White-European,no,no,'United States',no,6,'18 and more',Self,NO
9,1,1,1,1,0,1,1,1,1,0,...,m,Asian,yes,yes,Bahamas,no,8,'18 and more','Health care professional',YES


__Discussion__ about preprocessing this data. As can be seen from the table above, there are some missing data points in the table. There are a few approaches: first, the missing values could be replaced with the _mean_ along the axis (here, the column). Second, they could be replaced with the _median_ along the column, or third, with the _most frequent_ value. Unfortunately, all the missing values are in _categorical_ inputs, so the _mean_ and _median_ cannot be used: the mean of 'Latino' and 'Black' makes neither numerical nor categorical sense. The final solution is therefore to use the _most frequent_ value. The rows with missing values could instead be eliminated, but given that there are only 704 samples and the information in those rows might be very important to our model, elimination does not seem to be a good idea.

Print the columns that have the missing values. As shown in table above, missing values are marked "NaN"

In [9]:
# Flag every column that contains at least one NaN, then print the names of
# the flagged columns
has_missing = Autism_frame.isnull().any()
print(Autism_frame.columns[has_missing])

Index(['ethnicity', 'relation'], dtype='object')


_ethnicity_ is the 13th column and _relation_ is the 20th column. Now find the most frequent value in each column and assign that value to the missing values.

In [10]:
# Find the mode (most frequent value) of every column; the first row of the
# result holds the top mode of each column
autism_mode = Autism_frame.mode(axis=0)

# Fill the missing entries of the two columns that contain NaN ('ethnicity'
# and 'relation') with that column's most frequent value. The result is
# assigned back to the frame: calling replace(..., inplace=True) on a selected
# column is a chained in-place operation that is not guaranteed to update the
# frame itself.
for column in ('ethnicity', 'relation'):
    Autism_frame[column] = Autism_frame[column].fillna(autism_mode[column].iloc[0])

Since there are categorical data in our matrix, they need to be encoded so that ML algorithms can be applied to them.

In [11]:
# Encode the categorical attributes as integer codes
from sklearn.preprocessing import LabelEncoder

labelEncoder_X = LabelEncoder()
attribute_names = meta.names()

# Categorical attributes start at position 11; position 17 ('result') is
# numeric and therefore left as it is
categorical_positions = [pos for pos in range(11, len(attribute_names)) if pos != 17]
for pos in categorical_positions:
    column_name = attribute_names[pos]
    Autism_frame[column_name] = labelEncoder_X.fit_transform(Autism_frame[column_name])

# Split the frame into the input matrix (every column but the last) and the
# label vector (column 20), both converted to integers
Autism_matrix = Autism_frame.iloc[:, :-1].values.astype(int, copy = True)
Autism_label = Autism_frame.iloc[:, 20].values.astype(int, copy = True)

## Preprocess the Data - Air Quality UCI

Load the Air Quality UCI data

In [12]:
# Load the data. `pd.read_excel` reads the Excel workbook into a DataFrame.
# In this data set the missing values are not stored as NaN but as the
# placeholder value -200 (visible in the rows displayed below), so they have
# to be located explicitly during preprocessing.
Air_Quality = pd.read_excel('AirQualityUCI.xlsx')

# show the first 10 rows of data frame
Air_Quality.head(10)

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2004-03-10,18:00:00,2.6,1360.0,150,11.881723,1045.5,166.0,1056.25,113.0,1692.0,1267.5,13.6,48.875001,0.757754
1,2004-03-10,19:00:00,2.0,1292.25,112,9.397165,954.75,103.0,1173.75,92.0,1558.75,972.25,13.3,47.7,0.725487
2,2004-03-10,20:00:00,2.2,1402.0,88,8.997817,939.25,131.0,1140.0,114.0,1554.5,1074.0,11.9,53.975,0.750239
3,2004-03-10,21:00:00,2.2,1375.5,80,9.228796,948.25,172.0,1092.0,122.0,1583.75,1203.25,11.0,60.0,0.786713
4,2004-03-10,22:00:00,1.6,1272.25,51,6.518224,835.5,131.0,1205.0,116.0,1490.0,1110.0,11.15,59.575001,0.788794
5,2004-03-10,23:00:00,1.2,1197.0,38,4.741012,750.25,89.0,1336.5,96.0,1393.0,949.25,11.175,59.175,0.784772
6,2004-03-11,00:00:00,1.2,1185.0,31,3.624399,689.5,62.0,1461.75,77.0,1332.75,732.5,11.325,56.775,0.760312
7,2004-03-11,01:00:00,1.0,1136.25,31,3.326677,672.0,62.0,1453.25,76.0,1332.75,729.5,10.675,60.0,0.770238
8,2004-03-11,02:00:00,0.9,1094.0,24,2.339416,608.5,45.0,1579.0,60.0,1276.0,619.5,10.65,59.674999,0.764819
9,2004-03-11,03:00:00,0.6,1009.75,19,1.696658,560.75,-200.0,1705.0,-200.0,1234.75,501.25,10.25,60.200001,0.751657


__Discussion__ about preprocessing this data. The columns 'Date' and 'Time' are not used and are therefore discarded. The missing values in this data are marked '-200'. There are a few approaches: first, the missing values could be replaced with the _mean_ along the axis (here, the column). Second, they could be replaced with the _median_ along the column, or third, with the _most frequent_ value. Since the data is numerical, the _mean_ and _median_ can be used. Here the _mean_ approach is utilized.

In [13]:
# Remove the unused 'Date' and 'Time' columns and rebind the name to the result
Air_Quality = Air_Quality.drop(columns = ['Date', 'Time'])

In [14]:
# Count how many missing values (marked -200) each column has
print("Column    Number of Missing Values")

# Summing the boolean mask counts the matches directly. It also yields 0 for a
# column without any -200, where looking up value_counts()[1] would fail.
for position, column in enumerate(Air_Quality.columns, start = 1):
    count = (Air_Quality[column] == -200).sum()
    print("{}         {}".format(position, count))

Column    Number of Missing Values
1         1683
2         366
3         8443
4         366
5         366
6         1639
7         366
8         1642
9         366
10         366
11         366
12         366
13         366


The sensor responsible for 'NMHC(GT)' must be really bad, yielding too many missing or bad values; therefore, that column needs to be discarded from the data. The 1st, 6th and 8th columns also yield many bad values; however, we would use the _mean_ approach discussed above to replace them. The 11th column is the output temperature and it has 366 bad values. For the output, using the approaches above would not be a good idea. Also, the count 366 appears too often, so the measurements must have been wrong for those rows as a whole. Eliminating all the rows that have missing values in the output should be the optimal choice.

In [15]:
# Display the first 10 rows whose output 'T' carries the -200 missing-value marker
Air_Quality.loc[Air_Quality['T'].eq(-200)].head(10)

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
524,1.7,-200.0,222,-200.0,-200.0,99.0,-200.0,72.0,-200.0,-200.0,-200.0,-200.0,-200.0
525,1.9,-200.0,197,-200.0,-200.0,108.0,-200.0,81.0,-200.0,-200.0,-200.0,-200.0,-200.0
526,2.3,-200.0,319,-200.0,-200.0,131.0,-200.0,93.0,-200.0,-200.0,-200.0,-200.0,-200.0
701,2.0,-200.0,137,-200.0,-200.0,129.0,-200.0,106.0,-200.0,-200.0,-200.0,-200.0,-200.0
702,2.4,-200.0,189,-200.0,-200.0,154.0,-200.0,109.0,-200.0,-200.0,-200.0,-200.0,-200.0
703,1.8,-200.0,159,-200.0,-200.0,118.0,-200.0,97.0,-200.0,-200.0,-200.0,-200.0,-200.0
704,1.0,-200.0,80,-200.0,-200.0,69.0,-200.0,83.0,-200.0,-200.0,-200.0,-200.0,-200.0
705,1.0,-200.0,66,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
706,1.0,-200.0,87,-200.0,-200.0,97.0,-200.0,79.0,-200.0,-200.0,-200.0,-200.0,-200.0
707,0.9,-200.0,79,-200.0,-200.0,145.0,-200.0,84.0,-200.0,-200.0,-200.0,-200.0,-200.0


In [16]:
# Rows whose output column 'T' holds the -200 missing-value marker
missing_output_rows = Air_Quality.index[Air_Quality['T'] == -200]

# Remove those rows first, then remove the 'NMHC(GT)' column
Air_Quality = Air_Quality.drop(index = missing_output_rows)
Air_Quality = Air_Quality.drop(columns = ['NMHC(GT)'])

After dropping the unnecessary column and rows, count the number of missing values again. It can easily be
seen that our assumption was right: "366 appears too often, therefore, the measurements must be wrong at those missing values. Eliminating all the rows that have missing values at output should be the optimal choice."

In [17]:
# Count how many missing values (marked -200) each column has
print("Column    Number of Missing Values")

# Summing the boolean mask counts the matches directly. Subtracting the
# value_counts() entry for False from the row count would raise a KeyError for
# a column in which every value is -200.
for position, column in enumerate(Air_Quality.columns, start = 1):
    count = (Air_Quality[column] == -200).sum()
    print("{}         {}".format(position, count))

Column    Number of Missing Values
1         1647
2         0
3         0
4         0
5         1595
6         0
7         1598
8         0
9         0
10         0
11         0
12         0


Whether to drop columns 1, 5 and 7 or to drop the affected rows will affect our Machine Learning model. Dropping 1595 samples means dropping about 18% of the remaining data (1595 of 8991 rows). Dropping columns 1, 5 and 7 leaves us with 8 attributes. Which option gives us more total benefit is worth considering. Here, I choose to drop the 1595 rows and the first column.

In [18]:
# Rows whose 'NOx(GT)' input reading holds the -200 missing-value marker
missing_nox_rows = Air_Quality.index[Air_Quality['NOx(GT)'] == -200]

# Remove those rows first, then remove the 'CO(GT)' column
Air_Quality = Air_Quality.drop(index = missing_nox_rows)
Air_Quality = Air_Quality.drop(columns = ['CO(GT)'])

# 10 input attributes remain (plus the output 'T')
# 7396 samples remain

In [19]:
# Count how many missing values (marked -200) each column has
print("Column    Number of Missing Values")

# Summing the boolean mask counts the matches directly. Subtracting the
# value_counts() entry for False from the row count would raise a KeyError for
# a column in which every value is -200.
for position, column in enumerate(Air_Quality.columns, start = 1):
    count = (Air_Quality[column] == -200).sum()
    print("{}         {}".format(position, count))

Column    Number of Missing Values
1         0
2         0
3         0
4         0
5         0
6         3
7         0
8         0
9         0
10         0
11         0


For column 6, the _mean_ approach is utilized

In [20]:
# Create the data matrix for the Air Quality data: every column except the
# output 'T' is an input. The values are copied so that the in-place write
# below acts on an independent, writable array and never on the frame's data.
Air_Quality_matrix = Air_Quality.loc[:, Air_Quality.columns != 'T'].values.copy()
Air_Quality_values = Air_Quality['T'].values

# Take care of the remaining missing values (marked -200) in input column 6
# (index 5) by replacing them with the mean of that column's other values.
# sklearn.preprocessing.Imputer was removed from scikit-learn; SimpleImputer is
# its replacement and always works column-wise, so no `axis` argument is needed.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = -200, strategy = 'mean')
imputer = imputer.fit(Air_Quality_matrix[:,5:6])
Air_Quality_matrix[:,5:6] = imputer.transform(Air_Quality_matrix[:,5:6])