In [None]:
import numpy as np
import pandas as pd
from keras import Input, Model, layers
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import math
from scipy import stats
from tensorflow import keras
from keras import Input, Model, layers
from scipy.linalg import eigh, cholesky
from scipy.stats import norm
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the cleaned recordings and keep only the first three columns as a NumPy array.
# NOTE(review): `X` is not read again in the visible part of the notebook.
dataset = pd.read_csv("clean_data.csv")
X = dataset.to_numpy()[:, :3]

# Dataset Generation

Correlated Gaussian noise

In [None]:
# Records which decomposition is used below.
# NOTE(review): `method` is not read anywhere in the visible notebook.
method = 'cholesky'

# Fixed seed so that Restart & Run All regenerates exactly the same synthetic dataset.
SEED = 42

num_samples = 1000

# Covariance matrix (symmetric; row/column order matches the mean vector below)
r = np.array([
        [1000000, 20000, 500],
        [20000,    2500, 20],
        [500,       20,    1]
    ])

# Mean vector, stored as a (3, 1) column so it broadcasts across every sample
m = np.array([
    [5000], # Distance travelled in metres/day
    [100], # Time spent on wheelchair in minutes/day
    [2.5] # Average distance from home in km/day
])

# Generate samples from three independent normally distributed random variables
# (with mean 0 and std. dev. 1); seeded for reproducibility.
x = norm.rvs(size=(3, num_samples), random_state=SEED)

# Compute the Cholesky decomposition: lower-triangular c with c @ c.T == r
c = cholesky(r, lower=True)

# Convert the data to correlated random variables and shift them by the mean.
y = np.dot(c, x) + m
# Fold negative draws back to positive values. This keeps every quantity
# non-negative, at the cost of slightly distorting the tails of the distribution.
y = np.absolute(y)

In [None]:
# Visual check of the simulated correlation: row 1 of y (duration) on the
# x-axis against row 0 of y (distance travelled) on the y-axis.
duration_minutes, distance_metres = y[1, :], y[0, :]
plt.scatter(duration_minutes, distance_metres, s=10, c='r')
plt.xlabel("Duration on wheelchair (minutes/day)")
plt.ylabel("Distance travelled on wheelchair (metres/day)")

In [None]:
# Swap axes so that each row is one sample and each column one variable.
dataset = np.transpose(y)

In [None]:
# Persist the simulated samples. With to_csv's defaults the row index is
# written as an extra leading column of the file.
simulated_frame = pd.DataFrame(data=dataset)
simulated_frame.to_csv("Correlated_simulated_data.csv")

In [None]:
# Reload the simulated samples from disk (`dataset` is a DataFrame from here on).
dataset = pd.read_csv("Correlated_simulated_data.csv")

# Keep columns 1..3 only; column 0 is presumably the row index that to_csv
# wrote alongside the data. Each remaining column is then rescaled to [0, 1].
scaler = MinMaxScaler()
X_corr = scaler.fit_transform(dataset.to_numpy()[:, 1:4])

# PCA

In [None]:
# Fit a one-component PCA on the scaled samples (fit returns the estimator itself) ...
pca = PCA(n_components=1).fit(X_corr)

# ... and project every sample onto that single principal axis.
Y = pca.transform(X_corr)

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), applied element-wise.

    Args:
        x: Scalar or array-like of real numbers.

    Returns:
        Values in [0, 1] with the same shape as ``x``.
    """
    # expit is SciPy's numerically stable logistic function: unlike the
    # hand-written formula it does not overflow (and warn) for large negative
    # inputs, and it accepts plain Python sequences as well as arrays.
    from scipy.special import expit

    return expit(x)

# Squash the PCA projection into (0, 1) and express it on a 0-100 scale.
Y = 100 * sigmoid(Y)

In [None]:
# Preview the first few PCA scores instead of dumping the entire array.
Y[:10]

In [None]:
# Summary statistics of the PCA scores, computed down the sample axis.
stats.describe(Y, axis=0)

In [None]:
# Strip plot: every PCA score placed on the 0-100 axis at height 0.
plt.scatter(Y, np.zeros(Y.shape[0]), s=20)
plt.xlim(0, 100)
plt.xlabel("PCA wellness score")
plt.show()  # must be called; a bare `plt.show` only references the function

# Autoencoder

In [None]:
# Layer sizes: the input features are squeezed through a single latent unit.
input_dim, latent_dim = 3, 1

# Encoder: input -> Dense(2, relu) -> Dense(latent_dim, sigmoid)
input_layer = Input(shape=(input_dim,))
encoder_hidden = layers.Dense(2, activation='relu')(input_layer)
encoded = layers.Dense(latent_dim, activation='sigmoid')(encoder_hidden)

# Decoder mirrors the encoder: latent -> Dense(2, relu) -> Dense(input_dim, sigmoid)
decoder_hidden = layers.Dense(2, activation='relu')(encoded)
decoded = layers.Dense(input_dim, activation='sigmoid')(decoder_hidden)

# Full model, trained to reproduce its own input.
autoencoder = Model(inputs=input_layer, outputs=decoded)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# batch_size=None leaves the batch size at Keras' default.
fit_options = dict(epochs=500, batch_size=None, shuffle=True)
autoencoder.fit(X_corr, X_corr, **fit_options)

# Encoder-only model sharing the trained layers: maps a sample to its latent value.
encoder = Model(inputs=input_layer, outputs=encoded)

In [None]:
# Save the trained encoder to disk.
# NOTE(review): absolute, machine-specific Windows path; it has to stay in step
# with the path used by the cell that loads the model back.
encoder_dir = r"C:\Users\ahmed\Desktop\Year 3\Projects\GM1\Code\3GM1_team_3\Machine_Learning"
encoder.save(encoder_dir)

In [None]:
# Reload the saved encoder, replacing the in-memory model.
# NOTE(review): absolute, machine-specific Windows path; it has to stay in step
# with the path used by the cell that saves the model.
saved_encoder_dir = r'C:\users\ahmed\Desktop\Year 3\Projects\GM1\Code\3GM1_team_3\Machine_Learning'
encoder = keras.models.load_model(saved_encoder_dir)

In [None]:
# Latent value of every sample (sigmoid output), expressed on a 0-100 scale.
Z = 100 * encoder.predict(X_corr)

In [None]:
# Preview the first few autoencoder scores instead of dumping the entire array.
Z[:10]

In [None]:
# Summary statistics of the autoencoder scores, computed down the sample axis.
stats.describe(Z, axis=0)

In [None]:
# Strip plot: every autoencoder score placed on the 0-100 axis at height 0.
# The zero heights are sized from Z itself rather than from the PCA array.
plt.scatter(Z, np.zeros(Z.shape[0]), s=20)
plt.xlim(0, 100)
plt.xlabel("Autoencoder score")
plt.show()  # must be called; a bare `plt.show` only references the function

# Comparison

In [None]:
# Direct comparison of the two scores, one point per sample.
plt.scatter(Y, Z)
plt.xlabel("PCA score")
plt.ylabel("Autoencoder score")
plt.show()

In [None]:
# Plotting the wellness scores against distance travelled (normalised)

i = 0  # column of X_corr to put on the x-axis

plt.scatter(X_corr[:,i],Y, label = "PCA", alpha=0.5, s=10)
plt.scatter(X_corr[:,i],Z, label = "Autoencoder", alpha=0.5, s=10)
plt.legend()
plt.xlabel("Normalised distance travelled")
plt.ylabel("Holistic Wellness Score")
plt.show()  # must be called; a bare `plt.show` only references the function

In [None]:
# Plotting the wellness scores against duration (normalised)

i = 1  # column of X_corr to put on the x-axis

plt.scatter(X_corr[:,i],Y, label = "PCA", alpha=0.5, s=10)
plt.scatter(X_corr[:,i],Z, label = "Autoencoder", alpha=0.5, s=10)
plt.legend()
plt.xlabel("Normalised duration on wheelchair")
plt.ylabel("Holistic Wellness Score")
plt.show()  # must be called; a bare `plt.show` only references the function

In [None]:
# Plotting the wellness scores against distance from home (normalised)

i = 2  # column of X_corr to put on the x-axis

plt.scatter(X_corr[:,i],Y, label = "PCA", alpha=0.5, s=10)
plt.scatter(X_corr[:,i],Z, label = "Autoencoder", alpha=0.5, s=10)
plt.legend()
plt.xlabel("Normalised average distance from home")
plt.ylabel("Holistic Wellness Score")
plt.show()  # must be called; a bare `plt.show` only references the function

The autoencoder appears to be returning an "anti-wellness" score (one that decreases as wellness increases), which can easily be corrected by subtracting it from 100.

In [None]:
# Reflect the autoencoder score within the 0-100 scale.
# NOTE(review): presumably the latent direction can differ between training
# runs, in which case this flip would not always be needed; confirm after retraining.
Z_corrected = np.subtract(100, Z)

In [None]:
# Summary statistics of the reflected autoencoder scores, computed down the sample axis.
stats.describe(Z_corrected, axis=0)

In [None]:
# Strip plot: every reflected autoencoder score placed on the 0-100 axis at height 0.
# The zero heights are sized from Z_corrected itself rather than from the PCA array.
plt.scatter(Z_corrected, np.zeros(Z_corrected.shape[0]), s=20)
plt.xlim(0, 100)
plt.xlabel("Corrected autoencoder score")
plt.show()  # must be called; a bare `plt.show` only references the function

Final comparison

In [None]:
# Plotting the wellness scores against distance travelled (normalised)

i = 0  # column of X_corr to put on the x-axis

plt.scatter(X_corr[:,i],Y, label = "PCA", alpha=0.5, s=10)
plt.scatter(X_corr[:,i],Z_corrected, label = "Autoencoder", alpha=0.5, s=10)
plt.legend()
plt.xlabel("Normalised distance travelled")
plt.ylabel("Holistic Wellness Score")
plt.show()  # must be called; a bare `plt.show` only references the function

In [None]:
# Plotting the wellness scores against duration (normalised)

i = 1  # column of X_corr to put on the x-axis

plt.scatter(X_corr[:,i],Y, label = "PCA", alpha=0.5, s=10)
plt.scatter(X_corr[:,i],Z_corrected, label = "Autoencoder", alpha=0.5, s=10)
plt.legend()
plt.xlabel("Normalised duration on wheelchair")
plt.ylabel("Holistic Wellness Score")
plt.show()  # must be called; a bare `plt.show` only references the function

In [None]:
# Plotting the wellness scores against distance from home (normalised)

i = 2  # column of X_corr to put on the x-axis

plt.scatter(X_corr[:,i],Y, label = "PCA", alpha=0.5, s=10)
plt.scatter(X_corr[:,i],Z_corrected, label = "Autoencoder", alpha=0.5, s=10)
plt.legend()
plt.xlabel("Normalised average distance from home")
plt.ylabel("Holistic Wellness Score")
plt.show()  # must be called; a bare `plt.show` only references the function

In [None]:
# NOTE(review): Axes3D is not referenced by name below; presumably the import is
# kept so that older matplotlib versions register the '3d' projection. Confirm.
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# Latent value of every sample, flattened to one colour value per point.
# This name was previously used without ever being assigned, so the cell
# failed with a NameError on a fresh kernel.
encoded_values = encoder.predict(X_corr).ravel()

fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# Scatter plot of the three scaled features, coloured by the encoder output
p = ax.scatter(X_corr[:, 0], X_corr[:, 1], X_corr[:, 2], c=encoded_values)

# Adding color bar
fig.colorbar(p)

ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
ax.set_zlabel('Feature 3')

plt.show()
