In [1]:
from numpy import genfromtxt
import pandas as pd
import numpy as np

In [2]:
# Scratch check: range(1, 30) spans 29 column indices.
# NOTE(review): the load in the next cell uses range(1, 31) (30 columns), so
# this cell looks like leftover exploration - confirm it is still needed.
len(range(1,30))

29

In [3]:
# Load the integrated latent embedding: skip the header row and read CSV
# columns 1..30 (column 0 is not used).
from numpy import genfromtxt

latent_csv = '/data/thijs/thesis/latent_integrated_mouse_downsampled.csv'
X = genfromtxt(latent_csv, delimiter=',', skip_header=1, usecols=range(1, 31))

In [4]:
# Check dimensionality: (rows, columns) of the loaded matrix
X.shape

(49884, 30)

In [5]:
# Rows of X (49884) minus the number of training labels (6281) = rows without a label
49884-6281

43603

In [6]:
# Read the training labels (third CSV column) as a NumPy array and show
# how many there are.
labels_csv = '/data/thijs/thesis/shekhar_labels_downsampled.csv'
y_labels = np.array(pd.read_csv(labels_csv, usecols=[2]))
len(y_labels)

6281

In [7]:
# Spot-check one entry; before the reshape each label is still a length-1 row
y_labels[1200]

array([4])

In [8]:
# Flatten the single-column label array into a 1-D vector.
# reshape(-1) infers the length, so the cell no longer depends on the
# hard-coded sample count 6281.
y_labels = y_labels.reshape(-1)

In [9]:
# Confirm the labels are held in a NumPy array
type(y_labels)

numpy.ndarray

In [10]:
# Confirm the labels are now a flat 1-D vector
y_labels.shape

(6281,)

In [11]:
# Pad the label vector with 0 (placeholder for "no label") for every row of X
# that has no training label. The pad length is derived from X rather than
# hard-coded, and re-running the cell appends nothing once the lengths match.
# A label vector longer than X makes np.zeros raise instead of passing silently.
n_unlabeled = X.shape[0] - y_labels.shape[0]
y_labels = np.append(y_labels, np.zeros(shape=(n_unlabeled,), dtype=np.int64))

In [12]:
# Display the padded label vector (training labels first, then the 0 placeholders)
y_labels

array([1, 4, 4, ..., 0, 0, 0])

In [13]:
# Sanity-check the per-code counts; code 0 is the placeholder appended for
# rows without a training label
np.unique(y_labels, return_counts=True)

(array([0, 1, 2, 3, 4, 5]), array([43603,   252,  2945,    48,  2945,    91]))

In [14]:
# Renumber labels for the algorithms to work: shift every code down by one so
# the classes become 0-based and the placeholder becomes -1.
#   0 -> -1 (unlabeled), 1 -> 0 (amacrine), 2 -> 1 (bipolar),
#   3 -> 2 (cones),      4 -> 3 (muller),   5 -> 4 (rods)
# Only codes 0..5 are touched, exactly as in a value-by-value reassignment.
in_code_range = (y_labels >= 0) & (y_labels <= 5)
y_labels[in_code_range] -= 1

In [15]:
# Spot-check a row past the 6281 training labels: expected to be -1 (unlabeled)
y_labels[8000]

-1

In [16]:
# Display the renumbered label vector
y_labels

array([ 0,  3,  3, ..., -1, -1, -1])

In [17]:
# Confirm the label vector is still a NumPy array after renumbering
type(y_labels)

numpy.ndarray

In [18]:
# Create unlabeled set: positions of the rows marked -1 (no training label).
# Deriving the indices from y_labels keeps them in sync with the label vector
# instead of repeating the hard-coded 6281 / 49884 row counts.
unlabeled_set = np.flatnonzero(y_labels == -1)

In [19]:
# Display the unlabeled row indices
unlabeled_set

array([ 6281,  6282,  6283, ..., 49881, 49882, 49883])

In [20]:
from sklearn.semi_supervised import LabelSpreading

In [21]:
# Learn with LabelSpreading (no gamma passed, at most 20 iterations), then
# collect labels for the unlabeled rows in two ways.
# NOTE(review): the RuntimeWarning printed below this cell and the all-zero
# predict() counts shown further down suggest NaNs in the normalised label
# distributions for this configuration - confirm before using
# predicted_labels from this run.
lp_model = LabelSpreading(max_iter=20).fit(X, y_labels)
# Labels assigned to the unlabeled rows during fitting.
transducted_labels = lp_model.transduction_[unlabeled_set]
# Predictions for the same rows; selecting them through unlabeled_set
# replaces the hard-coded 6281 offset.
predicted_labels = lp_model.predict(X[unlabeled_set])

  self.label_distributions_ /= normalizer
  self.label_distributions_ /= normalizer


In [22]:
# Class counts of the labels assigned to the unlabeled rows during fitting
np.unique(transducted_labels, return_counts=True)

(array([0, 1, 2, 3, 4]), array([ 4559,  6358,  1703,  2089, 28894]))

In [23]:
# Class counts of the predict() output for the same rows.
# NOTE(review): the recorded output contains only class 0 - compare with the
# transduction counts before trusting these predictions.
np.unique(predicted_labels, return_counts=True)

(array([0]), array([43603]))

In [24]:
# Display the transduced labels
transducted_labels

array([3, 4, 1, ..., 0, 4, 4])

In [25]:
# Number of iterations the fitted LabelSpreading model ran
lp_model.n_iter_

9

In [26]:
# Confirm the predictions are returned as a NumPy array
type(predicted_labels)

numpy.ndarray

In [27]:
# Kernel gamma of the current model; it was not passed explicitly when the
# model was created, so this is presumably the library default - confirm
lp_model.gamma

20

In [28]:
#np.savetxt("predicted_labels_new.csv", predicted_labels)

In [29]:
# Load the reference labels (third CSV column) that the predictions are
# scored against.
truth_csv = '/data/thijs/thesis/macosko_labels.csv'
true_labels = np.array(pd.read_csv(truth_csv, usecols=[2]))

In [30]:
# Display the reference labels (still a single-column 2-D array here)
true_labels

array([[4],
       [5],
       [2],
       ...,
       [1],
       [5],
       [5]])

In [31]:
# Shift the class codes 1..5 down by one so they use the same 0-based
# numbering as the training labels:
#   1 -> 0 amacrine, 2 -> 1 bipolar, 3 -> 2 cones, 4 -> 3 muller, 5 -> 4 rods
# Values outside 1..5 are left untouched.
is_class_code = (true_labels >= 1) & (true_labels <= 5)
true_labels[is_class_code] -= 1

In [32]:
# Number of predictions; must match the number of reference labels for scoring
len(predicted_labels)

43603

In [33]:
# Number of reference labels; must match the number of predictions for scoring
len(true_labels)

43603

In [34]:
from sklearn.metrics import classification_report

In [37]:
# Learn with LabelSpreading, gamma = 10, capped at 20 iterations.
lp_model = LabelSpreading(gamma=10, max_iter=20).fit(X, y_labels)
# Labels assigned to the unlabeled rows during fitting.
transducted_labels = lp_model.transduction_[unlabeled_set]
# Predictions for the same rows; selecting them through unlabeled_set
# replaces the hard-coded 6281 offset.
predicted_labels = lp_model.predict(X[unlabeled_set])

In [38]:
# Per-class precision / recall / F1 of the transduced labels against the
# reference labels.
class_names = ['Amacrine', 'Bipolar', 'Cones', 'Muller', 'Rods']
report = classification_report(true_labels, transducted_labels,
                               target_names=class_names, digits=4)
print(report)

              precision    recall  f1-score   support

    Amacrine     0.9176    0.9530    0.9349      4426
     Bipolar     0.9443    0.9585    0.9514      6285
       Cones     0.9392    0.8603    0.8980      1868
      Muller     0.7251    0.9889    0.8367      1624
        Rods     0.9827    0.9593    0.9709     29400

    accuracy                         0.9554     43603
   macro avg     0.9018    0.9440    0.9184     43603
weighted avg     0.9591    0.9554    0.9563     43603



In [39]:
# Per-class precision / recall / F1 of the predict() output against the
# reference labels.
class_names = ['Amacrine', 'Bipolar', 'Cones', 'Muller', 'Rods']
report = classification_report(true_labels, predicted_labels,
                               target_names=class_names, digits=4)
print(report)

              precision    recall  f1-score   support

    Amacrine     0.9176    0.9530    0.9349      4426
     Bipolar     0.9443    0.9585    0.9514      6285
       Cones     0.9392    0.8603    0.8980      1868
      Muller     0.7251    0.9889    0.8367      1624
        Rods     0.9827    0.9593    0.9709     29400

    accuracy                         0.9554     43603
   macro avg     0.9018    0.9440    0.9184     43603
weighted avg     0.9591    0.9554    0.9563     43603



In [43]:
from sklearn.semi_supervised import LabelPropagation

In [44]:
# Boolean mask over all rows: True where the row has no training label (-1).
# Derived from y_labels instead of hard-coded row counts; this also drops the
# redundant assignment of False to an array that was already all False.
# Note: this rebinds unlabeled_set from the index array built earlier to a mask.
unlabeled_set = y_labels == -1

In [58]:
# LabelPropagation, gamma = 1
prop_model = LabelPropagation(gamma=1).fit(X, y_labels)
# Labels assigned to the unlabeled rows during fitting.
transducted_labels = prop_model.transduction_[unlabeled_set]
# Predictions for the same rows; selecting them through unlabeled_set
# replaces the hard-coded 6281 offset.
predicted_labels = prop_model.predict(X[unlabeled_set])



In [59]:
# Class counts of the LabelPropagation predict() output
np.unique(predicted_labels, return_counts=True)

(array([0, 1, 2, 3, 4]), array([ 4210,  6233,   209,  1680, 31271]))

In [60]:
# Per-class precision / recall / F1 of the transduced labels against the
# reference labels.
class_names = ['Amacrine', 'Bipolar', 'Cones', 'Muller', 'Rods']
report = classification_report(true_labels, transducted_labels,
                               target_names=class_names, digits=4)
print(report)

              precision    recall  f1-score   support

    Amacrine     0.9715    0.9241    0.9472      4426
     Bipolar     0.9681    0.9601    0.9641      6285
       Cones     0.9809    0.1097    0.1974      1868
      Muller     0.9446    0.9772    0.9607      1624
        Rods     0.9310    0.9903    0.9597     29400

    accuracy                         0.9410     43603
   macro avg     0.9592    0.7923    0.8058     43603
weighted avg     0.9431    0.9410    0.9265     43603



In [61]:
# Per-class precision / recall / F1 of the predict() output against the
# reference labels.
class_names = ['Amacrine', 'Bipolar', 'Cones', 'Muller', 'Rods']
report = classification_report(true_labels, predicted_labels,
                               target_names=class_names, digits=4)
print(report)

              precision    recall  f1-score   support

    Amacrine     0.9715    0.9241    0.9472      4426
     Bipolar     0.9681    0.9601    0.9641      6285
       Cones     0.9809    0.1097    0.1974      1868
      Muller     0.9446    0.9772    0.9607      1624
        Rods     0.9310    0.9903    0.9597     29400

    accuracy                         0.9410     43603
   macro avg     0.9592    0.7923    0.8058     43603
weighted avg     0.9431    0.9410    0.9265     43603



In [49]:
# Number of iterations the fitted LabelPropagation model ran.
# NOTE(review): this cell carries execution count 49, i.e. it ran before the
# In [58] fit shown above, so the recorded value may belong to an earlier
# model. 1000 presumably means the iteration cap was reached - confirm
# convergence after a clean Restart & Run All.
prop_model.n_iter_

1000