## Principal component analysis (PCA)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import pylab as plt
import seaborn as sb
from IPython.display import Image
from IPython.core.display import HTML 
from pylab import rcParams

import sklearn
from sklearn import datasets

In [3]:
from sklearn import decomposition
from sklearn.decomposition import PCA

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import SGD
from keras.metrics import Precision, Recall
import keras.backend as K

import pickle
from typing import Optional 

In [5]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
!nvidia-smi

Num GPUs Available:  1
Fri Mar 17 05:30:17 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P0    28W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+----------------------------------------------------------------

### PCA on the iris dataset

In [6]:
#Import labels (these will the target variables we will use to predict)
labels = pd.read_csv("/content/drive/MyDrive/ENG 4000/labels.csv") 
lifetime = pd.read_csv("/content/drive/MyDrive/ENG 4000/lifetime.csv")
spectrum = pd.read_csv("/content/drive/MyDrive/ENG 4000/spectrum.csv")
scattering = pd.read_csv("/content/drive/MyDrive/ENG 4000/scattering.csv")
size = pd.read_csv("/content/drive/MyDrive/ENG 4000/size.csv")
lifetime_features = pd.read_csv("/content/drive/MyDrive/ENG 4000/lifetime_features.csv")

In [7]:
def processData(data, pollenNum=0, remove=False): #Takes code and removes outliers - Input is panda dataframe, output X is the data and y is it's label after processing
  features = pd.DataFrame()

  for x in data:
    features = pd.concat([features,data[x].iloc[: , 1:]], axis=1)

  results = pd.concat([labels, features],axis=1)
  results.drop(['Sample ID'], axis=1, inplace=True)

  if remove == True:
    results.drop(results[results["Pollen"] == pollenNum].index, inplace=True)

  feature_names = []
  for col in features.columns:
      feature_names.append(col)
  thresh = 3
  data = results.copy()
  for feat in feature_names:
      mean = np.mean(data[feat])
      std = np.std(data[feat]) 
      for x in data[feat]:
          z = (x-mean)/std
          if z > thresh:
              data[feat] = data[feat].replace(x,mean) #Remove point if greater than thresh, or change mean to mean of only inliers
              
  y = results['Pollen'] # we are using channel as target variable
  X = results.drop(['Pollen'], axis=1)

  temp = pd.DataFrame(results['Pollen'], columns=["0", "1", "2","3","4","5","6","7","8","9","10","11"])
  for i in y: #for every row results['Pollen'] (this gets the pollen number), we will create a table with a column for each pollen number and set that column to 1
    temp2 = np.zeros(len(temp.columns))
    temp2[i] = 1
    temp = pd.concat([temp, pd.DataFrame(temp2.reshape(1,-1), columns=list(temp))], ignore_index=True)
  y = temp
  
  X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.4, random_state=42)
  X_test, X_Val, y_test, y_Val = train_test_split(X_test, y_test, shuffle = True, test_size=0.5, random_state=42)

  pca = PCA(n_components = .995)
  pca.fit_transform(X_train)
  pca.fit_transform(X_test)
  pca.fit_transform(X_Val)
  pca.fit_transform(y_train)
  pca.fit_transform(y_test)
  pca.fit_transform(y_Val)
  return X_train, X_test, X_Val, y_train, y_test, y_Val

  

In [8]:
def get_f1(y_true, y_pred): #taken from old keras source code
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [9]:
def evaluate(output, test_labels):
  true = 0
  false = 0
  percentResults = {0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0,10:0,11:0}
  trueResults = {0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0,10:0,11:0}
  totalResults = {0:0,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,9:0,10:0,11:0}

  for i in range(len(output)):
    prediction = np.argmax(output[i])
    actual = np.argmax(test_labels.iloc[i])
    totalResults.update({prediction: totalResults[prediction] + 1})
    if (prediction == actual):
      true += 1
      trueResults.update({prediction: trueResults[prediction] + 1})
      percentResults.update({prediction: np.round((trueResults[prediction] + 1)/(totalResults[prediction]),3)})
    else:
      false += 1
      percentResults.update({prediction: np.round(trueResults[prediction]/(totalResults[prediction]),3)})
  print(f"True {true}") 
  print(f"False {false}")
  print(f"{true/(true+false)*100}%") #This value is high because a lot of these data points were used to train the model
  print(f'True Results: {trueResults}')
  print(f'Percent Overall: {percentResults}')
  print(f'Total Predictions: {totalResults}')

In [10]:
def loadModel(fileName):
  custom_objects = {"get_f1": get_f1}
  with tf.keras.utils.custom_object_scope(custom_objects):
    with open('/content/drive/MyDrive/ENG 4000/Models/'+ fileName , 'rb') as file:
      model = pickle.load(file)
  return model

In [11]:
def saveModel(model,fileName):
  with open('/content/drive/MyDrive/ENG 4000/Models/'+ fileName , 'wb') as file:
    pickle.dump(model,file)
  return model

In [12]:
#Input layer size
#All data - 2629
#No lifetime - 2533
#Only scattering - 2400
def generateData(case):
  if case == 'all':
    data = {'lifetime':lifetime,
            'spectrum':spectrum,
            'scattering':scattering,
            'size':size,
            'lifetime_features':lifetime_features}

  elif case == 'scattering':
    data = {'scattering':scattering}

  elif case == 'no_lifetime':
    data = {'spectrum':spectrum,
            'scattering':scattering,
            'size':size,
            'lifetime_features':lifetime_features}

  return data

  
  

In [13]:
def makeModel():
  #Model 16 -  - PCA(n_components = .995)
  model = tf.keras.Sequential([
    tf.keras.layers.Input(2533), #lifetime
    tf.keras.layers.Dense(4096),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(2048),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Activation("relu"),
    tf.keras.layers.Dense(1024),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(512),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(32),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(16),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(7, activation="softmax")
    ])
  return model
  

In [14]:
def run():
  data = generateData('no_lifetime')
  for i in range(12):
    X_train, X_test, X_Val, y_train, y_test, y_Val = processData(data,i,True)
    model = makeModel()
    model.compile(optimizer =SGD(lr = 0.001,momentum=0.98), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall(), get_f1])#metrics = ["accuracy", Precision(), Recall()]
    history = model.fit(X_train,y_train,batch_size=256, epochs =1000, verbose =1, validation_data=(X_Val, y_Val))
    # saveModel(model,'PCA_V16.pkl')
    # model = loadModel('PCA_V16.pkl')
    output = model.predict(X_test)
    print(f'Removed Pollen: {i}')
    evaluate(output, y_test)


In [14]:
def temp(data, pollenNum=0, remove=False): #Takes code and removes outliers - Input is panda dataframe, output X is the data and y is it's label after processing
  features = pd.DataFrame()

  for x in data:
    features = pd.concat([features,data[x].iloc[: , 1:]], axis=1)

  results = pd.concat([labels, features],axis=1)
  results.drop(['Sample ID'], axis=1, inplace=True)

  results.loc[results["Pollen"].isin([1, 2, 3, 5, 8, 10]), "Pollen"] = 10 
  
  feature_names = []
  for col in features.columns:
      feature_names.append(col)
  thresh = 3
  data = results.copy()
  for feat in feature_names:
      mean = np.mean(data[feat])
      std = np.std(data[feat]) 
      for x in data[feat]:
          z = (x-mean)/std
          if z > thresh:
              data[feat] = data[feat].replace(x,mean) #Remove point if greater than thresh, or change mean to mean of only inliers
              
  y = results['Pollen'] # we are using channel as target variable
  X = results.drop(['Pollen'], axis=1)

  temp = pd.DataFrame(results['Pollen'], columns=["0","4","6","7","9","10","11"]) 
  for i in y: #for every row results['Pollen'] (this gets the pollen number), we will create a table with a column for each pollen number and set that column to 1
    temp2 = np.zeros(len(temp.columns))
    temp2[temp.columns.get_loc(str(i))] = 1      
    temp = pd.concat([temp, pd.DataFrame(temp2.reshape(1,-1), columns=list(temp))], ignore_index=True)
  y = temp
  # y = np.argmax(y.values, axis=1)
  # return X, y #X = pandas dataframe, y = numpy array
  X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.4, random_state=42)
  X_test, X_Val, y_test, y_Val = train_test_split(X_test, y_test, shuffle = True, test_size=0.5, random_state=42)

  pca = PCA(n_components = .995)
  pca.fit_transform(X_train)
  pca.fit_transform(X_test)
  pca.fit_transform(X_Val)
  pca.fit_transform(y_train)
  pca.fit_transform(y_test)
  pca.fit_transform(y_Val)
  return X_train, X_test, X_Val, y_train, y_test, y_Val

In [104]:
def temp2(data, pollenNum=0, remove=False): #Takes code and removes outliers - Input is panda dataframe, output X is the data and y is it's label after processing
  features = pd.DataFrame()

  for x in data:
    features = pd.concat([features,data[x].iloc[: , 1:]], axis=1)

  results = pd.concat([labels, features],axis=1)
  results.drop(['Sample ID'], axis=1, inplace=True)

  results.loc[results["Pollen"].isin([1, 2, 3, 5, 8, 10]), "Pollen"] = 10 
  
  # feature_names = []
  # for col in features.columns:
  #     feature_names.append(col)
  # thresh = 3
  # data = results.copy()
  # for feat in feature_names:
  #     mean = np.mean(data[feat])
  #     std = np.std(data[feat]) 
  #     for x in data[feat]:
  #         z = (x-mean)/std
  #         if z > thresh:
  #             data[feat] = data[feat].replace(x,mean) #Remove point if greater than thresh, or change mean to mean of only inliers
              
  y = results['Pollen'] # we are using channel as target variable
  X = results.drop(['Pollen'], axis=1)

  temp = pd.DataFrame(results['Pollen'], columns=["0","4","6","7","9","10","11"]) 
  for i in y: #for every row results['Pollen'] (this gets the pollen number), we will create a table with a column for each pollen number and set that column to 1
    temp2 = np.zeros(len(temp.columns))
    temp2[temp.columns.get_loc(str(i))] = 1      
    temp = pd.concat([temp, pd.DataFrame(temp2.reshape(1,-1), columns=list(temp))], ignore_index=True)
  y = temp
  # y = np.argmax(y.values, axis=1)
  return X, y #X = pandas dataframe, y = numpy array
  # X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.4, random_state=42)
  # X_test, X_Val, y_test, y_Val = train_test_split(X_test, y_test, shuffle = True, test_size=0.5, random_state=42)

  # pca = PCA(n_components = .995)
  # pca.fit_transform(X_train)
  # pca.fit_transform(X_test)
  # pca.fit_transform(X_Val)
  # pca.fit_transform(y_train)
  # pca.fit_transform(y_test)
  # pca.fit_transform(y_Val)
  # return X_train, X_test, X_Val, y_train, y_test, y_Val

In [68]:
from sklearn.model_selection import StratifiedKFold

def performKfold(X, y):
    seed = np.random.seed(7)
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    cvscores = []
    for train, test in kfold.split(X, y):
        X_train,X_test=X[train],X[test]
        y_train,y_test=y[train],y[test]

        # create model
        model = makeModel()

        # Compile model
        model.compile(optimizer =SGD(learning_rate = 0.001,momentum=0.98), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall(), get_f1])#metrics = ["accuracy", Precision(), Recall()]

        # Fit the model
        model.fit(X_train,y_train, validation_split=0.33, epochs=500, batch_size=256, verbose=1)

        # evaluate the model
        scores = model.evaluate(X[test], y[test], verbose=0)
        print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
        cvscores.append(scores[1] * 100)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))


In [None]:
# def temp(data, pollenNum=0, remove=False): #Takes code and removes outliers - Input is panda dataframe, output X is the data and y is it's label after processing
#   features = pd.DataFrame()

#   for x in data:
#     features = pd.concat([features,data[x].iloc[: , 1:]], axis=1)

#   results = pd.concat([labels, features],axis=1)
#   results.drop(['Sample ID'], axis=1, inplace=True)

#   results.drop(results[results["Pollen"] == 1].index, inplace=True)
#   results.drop(results[results["Pollen"] == 3].index, inplace=True)
#   results.drop(results[results["Pollen"] == 5].index, inplace=True)
#   results.drop(results[results["Pollen"] == 8].index, inplace=True)
#   results.drop(results[results["Pollen"] == 10].index, inplace=True)

#   feature_names = []
#   for col in features.columns:
#       feature_names.append(col)
#   thresh = 3
#   data = results.copy()
#   for feat in feature_names:
#       mean = np.mean(data[feat])
#       std = np.std(data[feat]) 
#       for x in data[feat]:
#           z = (x-mean)/std
#           if z > thresh:
#               data[feat] = data[feat].replace(x,mean) #Remove point if greater than thresh, or change mean to mean of only inliers
              
#   y = results['Pollen'] # we are using channel as target variable
#   X = results.drop(['Pollen'], axis=1)

#   temp = pd.DataFrame(results['Pollen'], columns=["0", "1", "2","3","4","5","6","7","8","9","10","11"])
#   for i in y: #for every row results['Pollen'] (this gets the pollen number), we will create a table with a column for each pollen number and set that column to 1
#     temp2 = np.zeros(12)
#     temp2[i] = 1
#     temp = pd.concat([temp, pd.DataFrame(temp2.reshape(1,-1), columns=list(temp))], ignore_index=True)
#   y = temp
  
#   X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, test_size=0.4, random_state=42)
#   X_test, X_Val, y_test, y_Val = train_test_split(X_test, y_test, shuffle = True, test_size=0.5, random_state=42)

#   pca = PCA(n_components = .995)
#   pca.fit_transform(X_train)
#   pca.fit_transform(X_test)
#   pca.fit_transform(X_Val)
#   pca.fit_transform(y_train)
#   pca.fit_transform(y_test)
#   pca.fit_transform(y_Val)
#   return X_train, X_test, X_Val, y_train, y_test, y_Val

In [105]:
data = generateData('no_lifetime')
x, y = temp2(data)
x = x.values
# x = x.reset_index(drop=True)  # Reset the index to a range of integers
performKfold(x,y)
# model = makeModel()
# model.compile(optimizer =SGD(learning_rate = 0.001,momentum=0.98), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall(), get_f1])#metrics = ["accuracy", Precision(), Recall()]
# history = model.fit(X_train,y_train,batch_size=256, epochs =1000, verbose =1, validation_data=(X_Val, y_Val))
# saveModel(model,'PCA_V20.pkl')
# model = loadModel('PCA_V20.pkl')
# output = model.predict(X_test)
# print(f'Removed Pollen: {i}')
# evaluate(output, y_test)

ValueError: ignored

In [82]:
print(y)

[0 0 0 ... 6 6 6]


In [45]:
data = generateData("no_lifetime")
features = pd.DataFrame()

for x in data:
  features = pd.concat([features,data[x].iloc[: , 1:]], axis=1)

results = pd.concat([labels, features],axis=1)
results.drop(['Sample ID'], axis=1, inplace=True)

results.loc[results["Pollen"].isin([1, 2, 3, 5, 8, 10]), "Pollen"] = 10 

            
y = results['Pollen'] # we are using channel as target variable
X = results.drop(['Pollen'], axis=1)

temp = pd.DataFrame(results['Pollen'], columns=["0","4","6","7","9","10","11"])

for i in y: #for every row results['Pollen'] (this gets the pollen number), we will create a table with a column for each pollen number and set that column to 1
  temp2 = np.zeros(len(temp.columns))
  temp2[temp.columns.get_loc(str(i))] = 1   
  temp = pd.concat([temp, pd.DataFrame(temp2.reshape(1,-1), columns=list(temp))], ignore_index=True)
y = temp
# y = np.argmax(y.values, axis=1)

In [98]:
X, y = temp(data)
y = y.reshape(-1, 1) # reshape y to a column vector
performKfold(X, y)


KeyError: ignored

In [95]:
print(y)

[[1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [17]:
data = generateData('no_lifetime')
X_train, X_test, X_Val, y_train, y_test, y_Val = temp(data)
model = makeModel()
model.compile(optimizer =SGD(learning_rate = 0.0001,momentum=0.98), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall(), get_f1])#metrics = ["accuracy", Precision(), Recall()]
history = model.fit(X_train,y_train,batch_size=1024, epochs =3000, verbose =1, validation_data=(X_Val, y_Val))
saveModel(model,'PCA_V21.pkl')
model = loadModel('PCA_V21.pkl')
output = model.predict(X_test)
# print(f'Removed Pollen: {i}')
evaluate(output, y_test)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 689/3000
Epoch 690/3000
Epoch 691/3000
Epoch 692/3000
Epoch 693/3000
Epoch 694/3000
Epoch 695/3000
Epoch 696/3000
Epoch 697/3000
Epoch 698/3000
Epoch 699/3000
Epoch 700/3000
Epoch 701/3000
Epoch 702/3000
Epoch 703/3000
Epoch 704/3000
Epoch 705/3000
Epoch 706/3000
Epoch 707/3000
Epoch 708/3000
Epoch 709/3000
Epoch 710/3000
Epoch 711/3000
Epoch 712/3000
Epoch 713/3000
Epoch 714/3000
Epoch 715/3000
Epoch 716/3000
Epoch 717/3000
Epoch 718/3000
Epoch 719/3000
Epoch 720/3000
Epoch 721/3000
Epoch 722/3000
Epoch 723/3000
Epoch 724/3000
Epoch 725/3000
Epoch 726/3000
Epoch 727/3000
Epoch 728/3000
Epoch 729/3000
Epoch 730/3000
Epoch 731/3000
Epoch 732/3000
Epoch 733/3000
Epoch 734/3000
Epoch 735/3000
Epoch 736/3000
Epoch 737/3000
Epoch 738/3000
Epoch 739/3000
Epoch 740/3000
Epoch 741/3000
Epoch 742/3000
Epoch 743/3000
Epoch 744/3000
Epoch 745/3000
Epoch 746/3000
Epoch 747/3000
Epoch 748/3000
Epoch 749/3000
Epoch 750/3000
Epoch

In [None]:
X1 = {0: 0, 1: 0.233, 2: 0.265, 3: 0.171, 4: 0.515, 5: 0.162, 6: 0.31, 7: 0.364, 8: 0.187, 9: 0.291, 10: 0.05, 11: 0.293}
X2 = {0: 0.451, 1: 0, 2: 0.23, 3: 0.174, 4: 0.373, 5: 0.133, 6: 0.27, 7: 0.4, 8: 0.214, 9: 0.282, 10: 0.083, 11: 0.242}
X3 = {0: 0.215, 1: 0.224, 2: 0, 3: 0.324, 4: 0.29, 5: 0.142, 6: 0.296, 7: 0.347, 8: 0.223, 9: 0.311, 10: 0.095, 11: 0.25}
X4 = {0: 0.271, 1: 0.167, 2: 0.303, 3: 0, 4: 0.386, 5: 0.168, 6: 0.386, 7: 0.368, 8: 0.203, 9: 0.345, 10: 0.038, 11: 0.322}
X5 = {0: 0.245, 1: 0.261, 2: 0.182, 3: 0.136, 4: 0, 5: 0.158, 6: 0.423, 7: 0.353, 8: 0.139, 9: 0.291, 10: 0.128, 11: 0.333}
X6 = {0: 0.226, 1: 0.108, 2: 0.35, 3: 0.219, 4: 0.294, 5: 0, 6: 0.259, 7: 0.399, 8: 0.18, 9: 0.456, 10: 0.103, 11: 0.375}
X7 = {0: 0.194, 1: 0.209, 2: 0.341, 3: 0.18, 4: 0.41, 5: 0.134, 6: 0, 7: 0.397, 8: 0.273, 9: 0.299, 10: 0.128, 11: 0.257}
X8 = {0: 0.173, 1: 0.117, 2: 0.216, 3: 0.158, 4: 0.385, 5: 0.095, 6: 0.278, 7: 0, 8: 0.216, 9: 0.45, 10: 0.122, 11: 0.368}
X9 = {0: 0.377, 1: 0.312, 2: 0.214, 3: 0.142, 4: 0.328, 5: 0.153, 6: 0.386, 7: 0.423, 8: 0, 9: 0.306, 10: 0.078, 11: 0.233}
X10 = {0: 0.245, 1: 0.099, 2: 0.186, 3: 0.097, 4: 0.714, 5: 0.1, 6: 0.35, 7: 0.309, 8: 0.264, 9: 0, 10: 0.065, 11: 0.375}
X11 = {0: 0.197, 1: 0.159, 2: 0.234, 3: 0.154, 4: 0.339, 5: 0.11, 6: 0.368, 7: 0.456, 8: 0.281, 9: 0.337, 10: 0, 11: 0.343}
X12 = {0: 0.167, 1: 0.326, 2: 0.234, 3: 0.26, 4: 0.207, 5: 0.144, 6: 0.333, 7: 0.306, 8: 0.172, 9: 0.387, 10: 0.098, 11: 0}
Xtot = {i: sum(x[i] for x in [X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12])/12 for i in range(12)}

print(Xtot) #drop 1,3,5,8,10

{0: 0.23008333333333333, 1: 0.18458333333333332, 2: 0.22958333333333333, 3: 0.16791666666666663, 4: 0.3534166666666667, 5: 0.12491666666666668, 6: 0.30491666666666667, 7: 0.34349999999999997, 8: 0.19600000000000004, 9: 0.3129166666666667, 10: 0.08233333333333333, 11: 0.28258333333333335}


In [None]:
run() 



Removed Pollen: 0
True 445
False 1346
24.8464544946957%
True Results: {0: 0, 1: 41, 2: 60, 3: 20, 4: 34, 5: 30, 6: 39, 7: 127, 8: 29, 9: 33, 10: 9, 11: 23}
Percent Overall: {0: 0, 1: 0.233, 2: 0.265, 3: 0.171, 4: 0.515, 5: 0.162, 6: 0.31, 7: 0.364, 8: 0.187, 9: 0.291, 10: 0.05, 11: 0.293}
Total Predictions: {0: 0, 1: 176, 2: 230, 3: 117, 4: 68, 5: 185, 6: 129, 7: 352, 8: 155, 9: 117, 10: 180, 11: 82}




Removed Pollen: 1
True 443
False 1415
23.842841765339074%
True Results: {0: 36, 1: 0, 2: 53, 3: 35, 4: 25, 5: 28, 6: 57, 7: 57, 8: 15, 9: 87, 10: 14, 11: 36}
Percent Overall: {0: 0.451, 1: 0, 2: 0.23, 3: 0.174, 4: 0.373, 5: 0.133, 6: 0.27, 7: 0.4, 8: 0.214, 9: 0.282, 10: 0.083, 11: 0.242}
Total Predictions: {0: 82, 1: 0, 2: 235, 3: 201, 4: 67, 5: 218, 6: 211, 7: 145, 8: 70, 9: 308, 10: 168, 11: 153}




Removed Pollen: 2
True 429
False 1367
23.886414253897552%
True Results: {0: 65, 1: 19, 2: 0, 3: 10, 4: 44, 5: 34, 6: 46, 7: 59, 8: 43, 9: 52, 10: 9, 11: 48}
Percent Overall: {0: 0.215, 1: 0.224, 2: 0, 3: 0.324, 4: 0.29, 5: 0.142, 6: 0.296, 7: 0.347, 8: 0.223, 9: 0.311, 10: 0.095, 11: 0.25}
Total Predictions: {0: 302, 1: 85, 2: 0, 3: 34, 4: 155, 5: 240, 6: 159, 7: 170, 8: 197, 9: 167, 10: 95, 11: 192}




Removed Pollen: 3
True 478
False 1358
26.034858387799563%
True Results: {0: 45, 1: 8, 2: 50, 3: 0, 4: 21, 5: 30, 6: 44, 7: 63, 8: 50, 9: 82, 10: 8, 11: 77}
Percent Overall: {0: 0.271, 1: 0.167, 2: 0.303, 3: 0, 4: 0.386, 5: 0.168, 6: 0.386, 7: 0.368, 8: 0.203, 9: 0.345, 10: 0.038, 11: 0.322}
Total Predictions: {0: 166, 1: 48, 2: 165, 3: 0, 4: 57, 5: 184, 6: 114, 7: 171, 8: 246, 9: 238, 10: 208, 11: 239}




Removed Pollen: 4
True 456
False 1413
24.398073836276083%
True Results: {0: 72, 1: 18, 2: 46, 3: 8, 4: 0, 5: 18, 6: 43, 7: 114, 8: 33, 9: 57, 10: 15, 11: 32}
Percent Overall: {0: 0.245, 1: 0.261, 2: 0.182, 3: 0.136, 4: 0, 5: 0.158, 6: 0.423, 7: 0.353, 8: 0.139, 9: 0.291, 10: 0.128, 11: 0.333}
Total Predictions: {0: 294, 1: 69, 2: 253, 3: 59, 4: 0, 5: 114, 6: 104, 7: 326, 8: 238, 9: 199, 10: 117, 11: 96}




Removed Pollen: 5
True 495
False 1316
27.332965212589727%
True Results: {0: 80, 1: 15, 2: 49, 3: 16, 4: 32, 5: 0, 6: 66, 7: 105, 8: 34, 9: 41, 10: 6, 11: 51}
Percent Overall: {0: 0.226, 1: 0.108, 2: 0.35, 3: 0.219, 4: 0.294, 5: 0, 6: 0.259, 7: 0.399, 8: 0.18, 9: 0.456, 10: 0.103, 11: 0.375}
Total Predictions: {0: 354, 1: 139, 2: 140, 3: 73, 4: 109, 5: 0, 6: 255, 7: 263, 8: 194, 9: 90, 10: 58, 11: 136}




Removed Pollen: 6
True 457
False 1398
24.63611859838275%
True Results: {0: 48, 1: 28, 2: 74, 3: 46, 4: 34, 5: 21, 6: 0, 7: 85, 8: 26, 9: 29, 10: 23, 11: 43}
Percent Overall: {0: 0.194, 1: 0.209, 2: 0.341, 3: 0.18, 4: 0.41, 5: 0.134, 6: 0, 7: 0.397, 8: 0.273, 9: 0.299, 10: 0.128, 11: 0.257}
Total Predictions: {0: 248, 1: 134, 2: 217, 3: 256, 4: 83, 5: 157, 6: 0, 7: 214, 8: 99, 9: 97, 10: 179, 11: 171}




Removed Pollen: 7
True 355
False 1405
20.170454545454543%
True Results: {0: 63, 1: 17, 2: 45, 3: 9, 4: 15, 5: 26, 6: 55, 7: 0, 8: 38, 9: 27, 10: 14, 11: 46}
Percent Overall: {0: 0.173, 1: 0.117, 2: 0.216, 3: 0.158, 4: 0.385, 5: 0.095, 6: 0.278, 7: 0, 8: 0.216, 9: 0.45, 10: 0.122, 11: 0.368}
Total Predictions: {0: 364, 1: 145, 2: 208, 3: 57, 4: 39, 5: 273, 6: 198, 7: 0, 8: 176, 9: 60, 10: 115, 11: 125}




Removed Pollen: 8
True 469
False 1345
25.85446527012128%
True Results: {0: 43, 1: 20, 2: 34, 3: 15, 4: 19, 5: 37, 6: 27, 7: 114, 8: 0, 9: 96, 10: 14, 11: 50}
Percent Overall: {0: 0.377, 1: 0.312, 2: 0.214, 3: 0.142, 4: 0.328, 5: 0.153, 6: 0.386, 7: 0.423, 8: 0, 9: 0.306, 10: 0.078, 11: 0.233}
Total Predictions: {0: 114, 1: 64, 2: 159, 3: 106, 4: 61, 5: 242, 6: 70, 7: 272, 8: 0, 9: 314, 10: 193, 11: 219}




Removed Pollen: 9
True 340
False 1390
19.653179190751445%
True Results: {0: 40, 1: 24, 2: 52, 3: 6, 4: 9, 5: 20, 6: 14, 7: 118, 8: 18, 9: 0, 10: 12, 11: 27}
Percent Overall: {0: 0.245, 1: 0.099, 2: 0.186, 3: 0.097, 4: 0.714, 5: 0.1, 6: 0.35, 7: 0.309, 8: 0.264, 9: 0, 10: 0.065, 11: 0.375}
Total Predictions: {0: 163, 1: 242, 2: 279, 3: 62, 4: 14, 5: 201, 6: 40, 7: 385, 8: 72, 9: 0, 10: 200, 11: 72}




Removed Pollen: 10
True 465
False 1376
25.25801195002716%
True Results: {0: 51, 1: 17, 2: 62, 3: 34, 4: 57, 5: 16, 6: 27, 7: 51, 8: 50, 9: 63, 10: 0, 11: 37}
Percent Overall: {0: 0.197, 1: 0.159, 2: 0.234, 3: 0.154, 4: 0.339, 5: 0.11, 6: 0.368, 7: 0.456, 8: 0.281, 9: 0.337, 10: 0, 11: 0.343}
Total Predictions: {0: 264, 1: 107, 2: 265, 3: 228, 4: 168, 5: 146, 6: 76, 7: 114, 8: 178, 9: 187, 10: 0, 11: 108}




Removed Pollen: 11
True 406
False 1339
23.266475644699142%
True Results: {0: 37, 1: 29, 2: 36, 3: 13, 4: 12, 5: 26, 6: 24, 7: 139, 8: 32, 9: 41, 10: 17, 11: 0}
Percent Overall: {0: 0.167, 1: 0.326, 2: 0.234, 3: 0.26, 4: 0.207, 5: 0.144, 6: 0.333, 7: 0.306, 8: 0.172, 9: 0.387, 10: 0.098, 11: 0}
Total Predictions: {0: 221, 1: 89, 2: 154, 3: 50, 4: 58, 5: 181, 6: 72, 7: 454, 8: 186, 9: 106, 10: 174, 11: 0}


In [None]:
data = generateData('no_lifetime')
X_train, X_test, X_Val, y_train, y_test, y_Val = temp(data)
model = makeModel()
model.compile(optimizer =SGD(learning_rate = 0.0001,momentum=0.98), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall(), get_f1])#metrics = ["accuracy", Precision(), Recall()]
history = model.fit(X_train,y_train,batch_size=256, epochs =2000, verbose =1, validation_data=(X_Val, y_Val))
saveModel(model,'PCA_V19.pkl')
model = loadModel('PCA_V19.pkl')
output = model.predict(X_test)
# print(f'Removed Pollen: {i}')
evaluate(output, y_test)

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E