In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

In [2]:
# Read the myopia.csv into a pandas DataFrame
csv_file = "../Resources/myopia.csv"
myopia_df = pd.read_csv(csv_file)
myopia_df.head()

Unnamed: 0,AGE,SPHEQ,AL,ACD,LT,VCD,SPORTHR,READHR,COMPHR,STUDYHR,TVHR,DIOPTERHR,MOMMY,DADMY,MYOPIC
0,6,-0.052,21.889999,3.69,3.498,14.7,45,8,0,0,10,34,1,1,1
1,6,0.608,22.379999,3.702,3.392,15.29,4,0,1,1,7,12,1,1,0
2,6,1.179,22.49,3.462,3.514,15.52,14,0,2,0,10,14,0,0,0
3,6,0.525,22.200001,3.862,3.612,14.73,18,11,0,0,4,37,0,1,1
4,5,0.697,23.290001,3.676,3.454,16.16,14,0,0,0,4,4,1,0,0


In [7]:
# Get the shape (number of columns and rows) of the DataFrame
myopia_df.shape

(618, 15)

In [8]:
# Find any missing or null values
myopia_df.isnull().sum()

AGE          0
SPHEQ        0
AL           0
ACD          0
LT           0
VCD          0
SPORTHR      0
READHR       0
COMPHR       0
STUDYHR      0
TVHR         0
DIOPTERHR    0
MOMMY        0
DADMY        0
MYOPIC       0
dtype: int64

In [9]:
# Get the column names
myopia_df.columns

Index(['AGE', 'SPHEQ', 'AL', 'ACD', 'LT', 'VCD', 'SPORTHR', 'READHR', 'COMPHR',
       'STUDYHR', 'TVHR', 'DIOPTERHR', 'MOMMY', 'DADMY', 'MYOPIC'],
      dtype='object')

In [11]:
# Find out if there are any duplicated entries
print(f"Duplicated entries: {myopia_df.duplicated().sum()}")

Duplicated entries: 0


In [3]:
# Remove the MYOPIC column from the dataset
myopia_remove = myopia_df.drop('MYOPIC', axis=1)

In [4]:
# Standardize the dataset 
# Create a scaler to standardize the data
scaler = StandardScaler()

# Train the scaler with the data.
X_scaled = scaler.fit_transform(myopia_remove)

In [5]:
# There are 14 dimensions (features/columns) in the Myopia Datasets 
X_scaled[0]

array([-0.42021911, -1.3639169 , -0.89286146,  0.48378402, -0.28144315,
       -1.0197916 ,  4.1506609 ,  1.69744958, -0.68931054, -0.67299591,
        0.18405806,  0.49830393,  0.98713773,  1.0032415 ])

In [12]:
# 2.1 Perform dimensionality reduction with PCA
 # Initialize PCA model preserving 90% of the explained variance in dimensionality reduction
pca = PCA(n_components=0.9)

# Get the principal components for the X_train_scaled
myopia_pca = pca.fit_transform(X_scaled)

# Get the explained variance ratio
pca.explained_variance_ratio_.sum()

0.9187361702915187

In [13]:
# Get the shape of features after reduction
myopia_pca.shape

(618, 10)

In [14]:
# Further reduce the dataset dimensions with t-SNE
tsne = TSNE(learning_rate=250, n_components=3, perplexity=50, verbose=1, n_iter=2500)
#Reduce dimensions
tsne_features = tsne.fit_transform(myopia_pca)



[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 618 samples in 0.007s...
[t-SNE] Computed neighbors for 618 samples in 0.028s...
[t-SNE] Computed conditional probabilities for sample 618 / 618
[t-SNE] Mean sigma: 1.438866
[t-SNE] KL divergence after 250 iterations with early exaggeration: 103.625763
[t-SNE] KL divergence after 2500 iterations: 2.334309


In [15]:
# Get the shape of features after reduction
tsne_features.shape

(618, 3)

In [16]:
# Create a scatter plot of the t-SNE output. Are there distinct clusters?
plt.scatter(tsne_features[:,0],tsne_features[:,1],c=labels)

NameError: name 'labels' is not defined