In [None]:
import numpy as np
from scipy.signal import savgol_filter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, balanced_accuracy_score,classification_report, confusion_matrix
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from sklearn.utils import shuffle

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that later cells can
# access the project files stored under /content/drive/MyDrive.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


## Smoothing the Data

## Function for Smoothing the Data

In [None]:
def SG(df_sg_temp, window_size=11, poly_order=2):
  """Smooth a 1-D signal with a Savitzky-Golay filter.

  Args:
    df_sg_temp: 1-D array-like signal (NumPy array, pandas Series, list, ...).
    window_size: Length of the filter window passed to `savgol_filter`.
      Defaults to 11, the value this notebook has always used.
    poly_order: Order of the polynomial fitted inside each window; must be
      smaller than `window_size`. Defaults to 2.

  Returns:
    NumPy array with the smoothed signal, same length as the input.

  Raises:
    ValueError: Propagated from `savgol_filter`, e.g. when the signal is
      shorter than `window_size` or `poly_order >= window_size`.
  """
  return savgol_filter(df_sg_temp, window_size, poly_order)

In [None]:
#Function for Spike removal
def SR(sr_temp, window_size=5, threshold=0.5):
  """Replace spiky samples of a 1-D signal with their local window mean.

  For every sample a window centred on it is taken, extending
  `int(window_size / 2)` samples to each side and truncated at the ends of the
  signal (so an even `window_size` behaves like `window_size + 1`). The
  coefficient of variation (std / mean) and the mean of that window are
  computed. Samples whose coefficient of variation exceeds

      cutoff = mean(cv) + threshold * (max(cv) - mean(cv))

  are replaced by the mean of their window; all other samples are kept.

  Args:
    sr_temp: 1-D array-like signal. It is not modified.
    window_size: Nominal window length in samples. Defaults to 5.
    threshold: Position of the cutoff between the mean (0) and the maximum (1)
      of the per-sample coefficients of variation. Defaults to 0.5.

  Returns:
    NumPy array of the same length as the input. An empty input yields an
    empty float array.

  Raises:
    ValueError: If `window_size` is smaller than 1.

  Note:
    The coefficient of variation divides by the window mean. A window whose
    mean is exactly zero produces inf/nan, which makes the cutoff nan, and in
    that case no sample of the signal is replaced.
  """
  if window_size < 1:
    raise ValueError("window_size must be a positive integer")

  values = np.asarray(sr_temp)
  sample_count = len(values)
  if sample_count == 0:
    # np.max() raises on an empty array; there is nothing to de-spike anyway.
    return values.astype(float)

  half_window = int(window_size / 2)
  windowCv = np.empty(sample_count)
  windowMean = np.empty(sample_count)

  for i in range(sample_count):
    # Slicing clips at the upper end on its own; only the lower bound needs a
    # guard so that a negative start does not wrap around.
    window = values[max(0, i - half_window):i + half_window + 1]
    windowMean[i] = np.mean(window)
    windowCv[i] = np.std(window) / windowMean[i]

  meanCv = np.mean(windowCv)
  cutoff = meanCv + threshold * (np.max(windowCv) - meanCv)

  return np.where(windowCv > cutoff, windowMean, values)

In [None]:
def getStandardScaledSpectra(spectra):
    """Standardize a single spectrum with a freshly fitted StandardScaler.

    The input is reshaped into a one-feature column, scaled by
    `StandardScaler.fit_transform` (fitted on this spectrum only), and
    flattened back to one dimension.

    Args:
        spectra: NumPy array holding one spectrum; it must support `.reshape`.

    Returns:
        1-D NumPy array with the scaled values, one per input element.
    """
    as_column = spectra.reshape(-1, 1)
    scaled_column = StandardScaler().fit_transform(as_column)
    return scaled_column.ravel()

def getStandardScaledData(data, mask=None):
    """Standardize every row of `data` independently.

    Each row is passed through `getStandardScaledSpectra`. Rows whose `mask`
    entry is truthy are skipped and stay filled with ones in the output.

    Args:
        data: NumPy array whose first axis enumerates the spectra.
        mask: Optional sequence indexed by row number; a truthy entry means
            "skip this row". When omitted, no row is skipped.

    Returns:
        New float NumPy array with the same shape as `data`.
    """
    row_count = data.shape[0]
    skip_row = np.zeros(row_count, dtype=bool) if mask is None else mask

    # Pre-filled with ones so that skipped rows come back as all ones.
    scaled = np.ones(data.shape)

    for row in range(row_count):
        if not skip_row[row]:
            scaled[row] = getStandardScaledSpectra(data[row])

    return scaled

In [None]:
# File numbers to preprocess: 19 through 39, leaving out 22 (and 0, should the
# range ever be widened to include it).
done_list = [file_id for file_id in range(19, 40) if file_id not in (22, 0)]

In [None]:
# done_list.remove(2)
# print(done_list)

In [None]:
# Preprocess every CSV listed in done_list and write the result back to Drive.
DATA_ROOT = "/content/drive/MyDrive/Data and Resources for Major Project"
LABELLED_DIR = f"{DATA_ROOT}/Labels Classified Data (CRISM)"
PREPROCESSED_DIR = f"{DATA_ROOT}/Pre_Processed_Minerals"

# The wavelength table does not depend on the file being processed, so it is
# read once here rather than on every iteration. The loop below does not use it.
df_W = pd.read_csv(f"{LABELLED_DIR}/Wavelength.csv").T
df_W.columns = ['Wavelength']

for i in done_list:

  # Transposed so that every row of the CSV becomes one column, which is then
  # processed as a single 1-D signal.
  df_mineral = pd.read_csv(f"{LABELLED_DIR}/{i}.csv").T

  # Each column is independent of the others, so the three preprocessing steps
  # are applied back to back per column instead of in three separate passes.
  for column in df_mineral.columns:
    smoothed = SG(df_mineral[column])                         # Savitzky-Golay filter
    despiked = SR(np.asarray(smoothed))                       # Spike removal
    df_mineral[column] = getStandardScaledSpectra(despiked)   # Spectra standardization

  # Exporting: transpose back to the input layout and drop the index column.
  df_mineral = df_mineral.T
  df_mineral.to_csv(f"{PREPROCESSED_DIR}/{i}.csv", index=False)
