## Reformatting MIR Spectral Data


### Setup

In [None]:
import numpy as np; print('numpy Version:', np.__version__)
import pandas as pd; print('pandas Version:', pd.__version__)

### Reading Dataset

In [None]:
# Load the aligned MIR extract and expand its comma-separated `spectra` string
# into one column per wavepoint (wp1 ... wp1060).

# Path of the extract to reformat.
file = 'SQL_extracted datasets/FINALlanghill_mir_extractready_revised_7pm_mir_aligned_20210711.csv'
# NOTE(review): data_2 is loaded but not used by the code below - confirm it is still needed.
data_2 = pd.read_csv('SQL_extracted datasets/Formatted_data/ZERODAYS_langhill_mir_extractready2_0am_mir_aligned_20210727.csv', sep=',')

data_ = pd.read_csv(file, sep=',')
print(data_.shape) #Shape is 417657 rows and 18 columns

#Remove all rows for which there was no aligned spectral data
data_ = data_.dropna(subset=['spectra'])
print(data_.shape)

# A bare expression in the middle of a cell is never displayed, so print it explicitly.
print(data_['weeklyave_FI'].describe())

#Break spectra from long format into individual columns
n_wavepoints = 1060
spectra_ = data_['spectra'].str.split(',', n=n_wavepoints, expand=True)
print(spectra_.shape)

# Fail early with a clear message if the spectra are shorter than expected.
if spectra_.shape[1] < n_wavepoints:
    raise ValueError(
        'Expected at least %d spectral wavepoints per row, found %d'
        % (n_wavepoints, spectra_.shape[1])
    )

#Rename each spectral wavepoint and append to the data table
# The split columns are labelled 0, 1, 2, ...; map them to wp1, wp2, ...
# Any overflow column beyond the first 1060 is discarded.
# Appending all columns in a single concat avoids the DataFrame fragmentation
# (and PerformanceWarning) caused by inserting 1060 columns one at a time.
wavepoints_ = spectra_.iloc[:, :n_wavepoints].rename(columns=lambda i: 'wp%s' % (i + 1))
data_ = pd.concat([data_, wavepoints_], axis=1)
print(data_.shape)

#Drop the spectra column as it is now represented in the other columns
data_ = data_.drop(columns=['spectra'])
print(data_.shape)

#Data is now read in and correctly aligned
data_.head()

### Data Formatting

In [None]:
#Reformatting all wavepoint columns
#Formatting as float32 - 32 is preferred for GPU

# Select the wavepoint columns by name rather than by position: the previous
# positional slices (14:1704 on the left, 14:1074 on the right) did not match
# each other and depended on the exact number of metadata columns.
wp_cols = ['wp%s' % (i + 1) for i in range(1060)]

# astype() returns new float32 columns, so the dtype really changes; assigning
# through .iloc may write into the existing object columns and leave them as object.
data_ = data_.astype({col: 'float32' for col in wp_cols}) #Long runtime


### Data Cleaning
Outliers and errors are common in spectral data. <br>
Causes: measurement or input error; data corruption; true outlier observations.<br>
Detection methods: standard deviation (for normally distributed traits); interquartile range (IQR).


In [None]:
#Outlier Removal
# 1) Check whether 'FI' looks normally distributed (Q-Q plot + three tests).
# 2) Trim 'weeklyave_FI' outliers with the 1.5 * IQR rule.
# NOTE(review): the normality checks run on 'FI' while the trimming below uses
# 'weeklyave_FI' - confirm that this difference is intended.
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import anderson, normaltest, shapiro


def _report_normality(stat, p, alpha=0.05):
    """Print a test statistic and p-value, then the verdict at level `alpha`."""
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    if p > alpha:
        print('Sample looks Gaussian (fail to reject H0)')
    else:
        print('Sample does not look Gaussian (reject H0)')


qqplot(data_['FI'], line ='s')

# Shapiro-Wilk normality test
# NOTE(review): scipy warns that the Shapiro-Wilk p-value may be inaccurate for
# N > 5000 - treat this result with care on a dataset of this size.
stat, p = shapiro(data_['FI'])
_report_normality(stat, p)

# D'Agostino and Pearson's Test
stat, p = normaltest(data_['FI'])
_report_normality(stat, p)

# Anderson-Darling test: compare the statistic with each critical value
result = anderson(data_['FI'])
print('Statistic: %.3f' % result.statistic)
for sl, cv in zip(result.significance_level, result.critical_values):
    if result.statistic < cv:
        print('%.3f: %.3f, data looks normal (fail to reject H0)' % (sl, cv))
    else:
        print('%.3f: %.3f, data does not look normal (reject H0)' % (sl, cv))

# IQR Method
# Identify outliers that are a factor of k of the IQR below the 25th perc or above 75th perc
# A factor of k of 3 or more can be used to identify values that are extreme outliers
# On a boxplot these limits are the whiskers at the end of the boxplot lines. (Outliers are dots)

#Calculate IQR
# Series.quantile skips missing values; np.percentile would return NaN if any
# were present, which would make both bounds NaN and leave data_cleaned empty.
q25 = data_['weeklyave_FI'].quantile(0.25)
q75 = data_['weeklyave_FI'].quantile(0.75)
iqr = q75 - q25
print('Q25 = %.3f' % (q25))
print('Q75 = %.3f' % (q75))
print('IQR = %.3f' % (iqr))
# Output recorded from a previous run (kept for reference only):
#   Q25 = 37.468
#   Q75 = 55.957
#   IQR = 18.490

#Calculate the outlier cutoff
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off

#Identify outliers
outliers = data_[(data_['weeklyave_FI'] > upper) | (data_['weeklyave_FI'] < lower)]
#Remove outliers
data_cleaned = data_[(data_['weeklyave_FI'] <= upper) & (data_['weeklyave_FI'] >= lower)]
print('Total observations: %d' % len(data_))

print('Identified outliers: %d' % len(outliers))
print('Non-outlier observations: %d' % len(data_cleaned))


### Data Export

In [None]:
# Write the cleaned table next to the other formatted datasets, naming it
# after the input file with a 'GENETIC' suffix.
from pathlib import Path

# Path.stem gives the file name without directory or extension however deep the
# input path is; file.split("/")[1] only worked for exactly one directory level.
fileName = Path(file).stem
data_cleaned.to_csv('SQL_extracted datasets/Formatted_data/' + fileName + 'GENETIC.csv', index=False)