## Principal component analysis (PCA)

In [1]:
# Mount Google Drive (Colab only) so the CSV files under /content/drive can be read below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import pylab as plt
import seaborn as sb
from IPython.display import Image
from IPython.core.display import HTML 
from pylab import rcParams

import sklearn
from sklearn import datasets

In [3]:
from sklearn import decomposition
from sklearn.decomposition import PCA

In [4]:
%matplotlib inline
# Notebook-wide plotting defaults: figure size and seaborn's white grid style.
rcParams['figure.figsize'] = 5, 4
sb.set_style('whitegrid')

### Load the pollen dataset

In [5]:
# Import labels (these are the target variables we will predict)
labels = pd.read_csv("/content/drive/MyDrive/ENG 4000/labels.csv") 
# labels.head()

In [6]:
# Import features (this is the data we use to predict the labels)
lifetime = pd.read_csv("/content/drive/MyDrive/ENG 4000/lifetime.csv")
# lifetime.head()  # head() shows the first 5 rows by default

In [7]:
# Load the spectrum feature table.
spectrum = pd.read_csv("/content/drive/MyDrive/ENG 4000/spectrum.csv")
# spectrum.head()

In [8]:
# Load the scattering feature table; this is the table later reshaped into images for the CNN.
scattering = pd.read_csv("/content/drive/MyDrive/ENG 4000/scattering.csv")
# scattering.head()

In [9]:
# Load the particle size table.
# NOTE(review): the displayed head shows two leftover index columns ('Unnamed: 0.1' and
# 'Unnamed: 0') next to 'size'; code that only drops the first column keeps 'Unnamed: 0'.
size = pd.read_csv("/content/drive/MyDrive/ENG 4000/size.csv")
size.head()

Unnamed: 0.1,Unnamed: 0,size
0,0,12.513989
1,1,19.461646
2,2,25.726931
3,3,35.178985
4,4,4.672308


In [10]:
# Load the lifetime_features table.
lifetime_features = pd.read_csv("/content/drive/MyDrive/ENG 4000/lifetime_features.csv")
# lifetime_features.head()

In [11]:
# Tables to sanity-check, keyed by a human-readable name.
data = {'scattering':scattering,}
pd.set_option('display.max_rows',None)
for name, frame in data.items():
  print(name)
  # Total number of missing cells in the table. The previous expression summed the
  # columns of rows containing a NaN and counted the positive sums, which is not a null count.
  print(f'Nulls: {frame.isna().sum().sum()}')
  # Number of rows that are exact duplicates of an earlier row.
  print(f'Duplicates: {frame.duplicated().sum()}\n')

# No null features, so we will not need to adjust our dataset

scattering
Nulls: 0
Duplicates: 0



In [12]:
labels.info() #Check the data types and how many nulls in the labels table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9866 entries, 0 to 9865
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Sample ID  9866 non-null   int64
 1   Pollen     9866 non-null   int64
dtypes: int64(2)
memory usage: 154.3 KB


In [13]:
labels['Pollen'].unique() # There are 12 unique values for pollen (0-11), i.e. 12 different types of pollen

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [14]:
# Courtesy of Randy
# Drop the first column of every table, then join the remaining columns side by side.
trimmed_tables = [table.iloc[:, 1:] for table in data.values()]
features = pd.concat([pd.DataFrame(), *trimmed_tables], axis=1)

# features.head()

# Thanks Randy!!

In [15]:
results = scattering
# results.head(25)

In [16]:
# Build a new frame instead of dropping in place: `results` was an alias of `scattering`,
# so the in-place drop also mutated `scattering` and raised KeyError when this cell was re-run.
results = scattering.drop(columns=['Sample ID'])
# results.head(25)

In [17]:
# from keras.datasets import fashion_mnist
# (a_train,b_train),(a_test,b_test) = fashion_mnist.load_data()

In [18]:
# This removes outliers
# feature_names = []
# for col in features.columns:
#     feature_names.append(col)
# thresh = 3
# data = results.copy()
# for feat in feature_names:
#     mean = np.mean(data[feat])
#     std = np.std(data[feat]) 
#     for x in data[feat]:
#         z = (x-mean)/std
#         if z > thresh:
#             data[feat] = data[feat].replace(x,mean)

In [19]:
# Reshape every row of `results` into a 60 x 40 array and transpose it, giving one
# 40 x 60 image per sample for the Conv2D layers: final shape (n_samples, 40, 60).
#
# This replaces a loop that called np.dstack((a, reformatted)) once per row. That loop
# PREPENDED each new sample, so after the final transpose sample k sat at index
# n_samples-1-k and no longer lined up with labels['Pollen'][k]. It also re-copied the
# whole stack on every iteration. The per-sample layout (each 60 x 40 array transposed
# to 40 x 60) is unchanged; only the sample order is corrected.
# NOTE(review): whether the physical image is 60x40 or 40x60 cannot be told from here.
sample_grids = results.to_numpy().reshape(-1, 60, 40)
reformatted = np.ascontiguousarray(sample_grids.transpose(0, 2, 1))
reformatted.shape


(9866, 40, 60)

In [20]:
# Will create function later
# def Conv2DFormat(dataframe, labels, width, height): #dataframe is the table you want to reformat, in the form of a pandas dataframe, returns a 
#   reformatted = results.iloc[0].to_numpy().reshape(width, height) #labels is the target variables
#   for i in range(1,len(results)):
#     a = results.iloc[i].to_numpy().reshape(60,40)
#     reformatted = np.dstack((a, reformatted))
#     reformatted = np.transpose(reformatted)
#   temp = pd.DataFrame(labels, labels.columns)
#   y = labels['Pollen'] # we are using channel as target variable
#   X = reformatted#results.drop(['Pollen'], axis=1)
#   for i in y: #Creating a 3d array, format is [#each row of dataframe, width, height]
#     temp2 = np.zeros(12) 
#     temp2[i] = 1
#     temp = temp.append(pd.DataFrame(temp2.reshape(1,-1), columns=list(temp)), ignore_index=True) 
#   y = temp
#   return X, y

In [21]:
type(reformatted)

numpy.ndarray

In [55]:
# Frame with one column per pollen class, named "0" through "11".
# NOTE(review): none of these names match 'Pollen', so this presumably starts with no rows — verify.
temp = pd.DataFrame(labels['Pollen'], columns=[str(class_id) for class_id in range(12)])

In [56]:
y = labels['Pollen'] # the pollen class id is the target variable
X = reformatted # image-shaped scattering data, one (40, 60) array per sample

In [57]:
# One-hot encode the pollen class ids (0-11) into a 12-column frame for categorical_crossentropy.
# Row k of np.eye(12)[ids] is all zeros except a 1 at column ids[k].
# This replaces a per-row DataFrame.append loop, which was quadratic, relied on a method
# removed in pandas 2.0, and kept growing `temp` every time the cell was re-run.
class_ids = labels['Pollen'].to_numpy()
y = pd.DataFrame(np.eye(12)[class_ids], columns=[str(k) for k in range(12)])

In [25]:
# pca = decomposition.PCA() #instantiates PCA object
# X_pca = pca.fit_transform(X) #performs dimension reduction on x and fits it to the PCA model

# temp = pca.explained_variance_ratio_ #this ratio tells us how much information is compressed into the first few components
#                               #we can use this to calculate cumulative variance, with this we can figure out how many components to keep
#                               #we need to make sure we keep at least 70% of the original data set's information

In [26]:
# print(temp[:10])

In [27]:
# temp[:100].sum()

In [58]:
# Per-sample image dimensions: axes 1 and 2 of the (n_samples, rows, cols) array.
img_rows, img_cols = reformatted.shape[1:3]

In [59]:
# Append a trailing axis of size 1 so each sample is (rows, cols, 1), matching the
# `input_shape` handed to the first Conv2D layer.
X= X.reshape(X.shape[0],img_rows,img_cols,1)
input_shape=(img_rows,img_cols,1)
input_shape

(40, 60, 1)

In [60]:
from sklearn.model_selection import train_test_split
# 60% of the samples go to training; the held-out 40% is then halved into test and validation.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, shuffle=True, random_state=42)
X_test, X_Val, y_test, y_Val = train_test_split(
    X_holdout, y_holdout, test_size=0.5, shuffle=True, random_state=42)

In [29]:
# from sklearn.decomposition import PCA
# import tensorflow as tf
# pca = PCA(n_components = 0.95)
# pca.fit_transform(X_train)
# pca.fit_transform(X_test)
# pca.fit_transform(X_Val)
# pca.fit_transform(y_train)
# pca.fit_transform(y_test)
# pca.fit_transform(y_Val)
# y_train = pd.DataFrame(y_train.reshape(len(y_train),1))
# y_test = pd.DataFrame(y_test.reshape(len(y_test),1))
# y_train = tf.keras.utils.to_categorical(y_train,12)
# y_test = tf.keras.utils.to_categorical(y_test,12)
# y_Val = tf.keras.utils.to_categorical(y_Val, 12)

In [61]:
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Activation
from tensorflow.keras.optimizers import SGD
from keras.metrics import Precision, Recall

# model = Sequential()
# model.add(tf.keras.layers.Input(shape = 2629,))
# model.add(tf.keras.layers.Dense(20,activation = 'relu'))
# model.add(tf.keras.layers.Dense(10,activation = 'relu'))
# model.add(tf.keras.layers.Dense(12,activation = 'relu'))
# model.add(tf.keras.layers.Dropout(0.5))

# model.add(tf.keras.layers.Dense(12,activation = 'softmax'))
# model.compile(optimizer =SGD(lr = 0.01,momentum=0.9), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall()])#metrics = ["accuracy", Precision(), Recall()]

In [31]:
# input_shape = (reformatted.shape[1],reformatted.shape[2])
# input_shape
# The convolutional layer of the first scattering block had 10
# filters with the kernel size of 5 × 5 while the second one had
# 20 filters with the kernel size of 3×3.

In [73]:
from sklearn.decomposition import PCA
import tensorflow as tf
# model = tf.keras.Sequential([
#     tf.keras.layers.Conv2D(10,kernel_size=(5,5),activation='relu',input_shape=(40,60)),
#     tf.keras.layers.Conv2D(20,kernel_size=(3,3),activation='relu',input_shape=(40,60)),
#     tf.keras.layers.MaxPooling2D(pool_size=(2,2)),
#     tf.keras.layers.Flatten(),
#     tf.keras.layers.Dense(128,activation='relu'),
#     tf.keras.layers.Dropout(0.5),
#     tf.keras.layers.Dense(12, activation="softmax")
#     ])

# CNN classifier: two convolution + max-pooling stages (10 filters of 5x5, then 20 filters
# of 3x3), flattened into a 32-unit dense layer with dropout and a 12-way softmax output.
# `input_shape` is given on the first layer only; it was redundant on the second Conv2D,
# whose input shape is determined by the layer before it.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(10,kernel_size=(5,5),activation='relu',input_shape=input_shape),
    tf.keras.layers.MaxPooling2D(pool_size=(4,4)),
    tf.keras.layers.Conv2D(20,kernel_size=(3,3),activation='relu'),
    tf.keras.layers.MaxPooling2D(pool_size=(2,2)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(32,activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(12, activation="softmax")  # one output per pollen class (labels 0-11)
    ])

# #Define the model
# model = Sequential()
# model.add(Conv2D(32,kernel_size=(3,3),activation='relu',input_shape=input_shape))
# #model.add(MaxPooling2D(pool_size=(2,2)))   #Removing MaxPooling layer: Add accuracy but reduces training speed
# model.add(Conv2D(64,kernel_size=(3,3),activation='relu'))
# model.add(MaxPooling2D(pool_size=(2,2)))
# model.add(Flatten())
# model.add(Dense(128,activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(num_classes,activation='softmax'))
# model.summary()



In [75]:
# model.compile(optimizer =tf.keras.optimizers.Adadelta, loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall()])#metrics = ["accuracy", Precision(), Recall()] 
# model.compile(optimizer =SGD(lr = 0.01,momentum=0.9), loss = "categorical_crossentropy", metrics = ["accuracy", Precision(), Recall()])#metrics = ["accuracy", Precision(), Recall()]
# Adam optimizer with categorical cross-entropy (the targets are one-hot rows);
# accuracy, precision and recall are reported per epoch.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics = ["accuracy", Precision(), Recall()])

In [36]:

# model.compile(loss = "categorical_crossentropy",
#               optimizer=tf.keras.optimizers.Adadelta,
#               metrics = ["accuracy", Precision(), Recall()])

In [37]:
# import pickle
# model = pickle.load(open('/content/drive/MyDrive/ENG 4000/model.pkl', 'rb'))

In [38]:
# # valuesToPredict = X
# # pca.fit_transform(valuesToPredict)
# output = model.predict(X_Val)

In [76]:
# Train with a stopping criterion instead of always running all 1000 epochs: stop once the
# validation loss has not improved for 20 consecutive epochs and restore the best weights
# seen. `epochs` stays at 1000 as an upper bound.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=20, restore_best_weights=True)
history = model.fit(X_train, y_train, batch_size=32, epochs=1000, verbose=1,
                    validation_data=(X_Val, y_Val), callbacks=[early_stop])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [None]:
# valuesToPredict = X
# pca.fit_transform(valuesToPredict)
# output = model.predict(valuesToPredict)

In [77]:
import pickle
# Persist the trained model. The `with` block guarantees the file is flushed and closed;
# the bare open() used before left the handle open.
# NOTE(review): Keras documents model.save(...) as the supported way to persist models;
# pickling a Keras model may not round-trip on every version — confirm before relying on it.
with open('/content/drive/MyDrive/ENG 4000/Experimental/model_Conv2D.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)



In [None]:
import pandas as pd
from sklearn.decomposition import PCA
def create():
  """Load every feature table from Drive and return the combined feature frame.

  Reads labels.csv plus the lifetime, spectrum, scattering, size and
  lifetime_features CSVs, drops the first column of each feature table, joins
  the remaining columns side by side after the labels, and removes the
  'Sample ID' and 'Pollen' columns.

  Returns:
    pandas.DataFrame of raw (untransformed) feature columns, one row per sample.

  Notes:
    * A PCA(n_components=0.95).fit_transform call used to run here, but its result
      was discarded and the raw frame was returned. That dead computation is
      removed; the return value is unchanged.
    * NOTE(review): only the first column of each table is dropped. size.csv appears
      to carry two index columns, so one of them may remain as a feature — confirm
      against what the saved model was trained on before changing this.
  """
  base_dir = "/content/drive/MyDrive/ENG 4000"
  labels = pd.read_csv(f"{base_dir}/labels.csv")

  # Order matters: it fixes the column order of the returned frame.
  table_names = ['lifetime', 'spectrum', 'scattering', 'size', 'lifetime_features']
  data = {name: pd.read_csv(f"{base_dir}/{name}.csv") for name in table_names}

  # Kept from the original so the global pandas display option is set as before.
  pd.set_option('display.max_rows',None)

  # One concat over all tables instead of growing a frame inside a loop.
  features = pd.concat([table.iloc[:, 1:] for table in data.values()], axis=1)

  results = pd.concat([labels, features], axis=1)
  valuesToPredict = results.drop(columns=['Sample ID', 'Pollen'])
  return valuesToPredict

In [None]:
# !pip install cloud-sql-python-connector==0.9.3
# !pip install pg8000
import pickle
# Build the feature frame, load the previously saved model and score every sample.
valuesToPredict = create()
# NOTE(review): pickle.load executes arbitrary code from the file — only load model files
# you created yourself. The `with` block closes the file handle, which the bare open() did not.
with open('/content/drive/MyDrive/ENG 4000/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
output = model.predict(valuesToPredict)

# Working code, modified by Randy and Jimmy to be compatible with the new SQL database

from google.cloud.sql.connector import Connector, IPTypes
import sqlalchemy

def getconn():
    """Open a pg8000 connection to the Cloud SQL Postgres instance.

    Used as the `creator` callback of the SQLAlchemy engine, so it takes no
    arguments and returns a DBAPI connection.

    The database password is no longer stored in the notebook. It is read from
    the POLLEN_DB_PASSWORD environment variable, and the user is prompted for it
    when the variable is not set. The password that was previously committed
    here should be rotated.
    """
    import getpass
    import os

    password = os.environ.get("POLLEN_DB_PASSWORD") or getpass.getpass("Postgres password: ")

    # NOTE(review): the Connector is closed as soon as this `with` block exits, i.e. before
    # the connection is used. Confirm against the connector docs whether one long-lived
    # Connector should be shared instead.
    with Connector() as connector:
        conn = connector.connect(
            "potent-comfort-376221:northamerica-northeast2:the-pollen-project",
            "pg8000",
            user="postgres",
            password=password,
            db="postgres",
            ip_type=IPTypes.PUBLIC
        )
    return conn

# Engine whose connections are produced by getconn().
pool = sqlalchemy.create_engine(
    "postgresql+pg8000://",
    creator=getconn,
)

# Each row of `output` contributes its first 12 values, in order, to one INSERT.
N_OUTPUT_COLUMNS = 12

# Bound parameters replace the hand-concatenated SQL string, so values are never spliced
# into the statement text.
insert_stmt = sqlalchemy.text(
    "INSERT INTO model_values VALUES ("
    + ", ".join(f":p{k}" for k in range(N_OUTPUT_COLUMNS))
    + ")"
)

# float() turns numpy scalars into plain Python floats that the driver can adapt.
insert_rows = [
    {f"p{k}": float(row[k]) for k in range(N_OUTPUT_COLUMNS)}
    for row in output
]

# The original loop called an undefined `cursor` and never committed. pool.begin() opens
# a transaction that commits on success and rolls back if an exception is raised.
with pool.begin() as db_conn:
    if insert_rows:
        db_conn.execute(insert_stmt, insert_rows)

In [None]:
# true = 0
# false = 0
# import numpy as np
# import pandas as pd
# labels = pd.read_csv("/content/drive/MyDrive/ENG 4000/labels.csv")

# for i in range(len(output)):
#   if (np.argmax(output[i]) == labels.iloc[i][1]):
#     true += 1
#   else:
#     false += 1
# print(f"True {true}") #This value is high because a lot of these data points were used to train the model, using the validation data set, it's around 25% accurate
# print(f"False {false}")

In [None]:
# import csv

# with open("data.csv","w+") as my_csv:
#     csvWriter = csv.writer(my_csv,delimiter=',')
#     csvWriter.writerows(output)
