Linear Discriminant Analysis (LDA)

In [1]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import pandas as pd
from sklearn.model_selection import train_test_split

# --- Input / output locations -------------------------------------------
data_filepath = '../data/embs/modelbreak/10-2-128-50-6-50/Oembslayer5-110000-120000.csv'
save_filedir = '../data/embs/modelbreak/10-2-128-50-6-50/'

# Number of leading columns that are used as the feature matrix.
n_features = 128

# Read the comma-separated table.
data = pd.read_csv(data_filepath, sep=',')

#2325
#3255

# Count rows per 'ldalabel' and collect the labels that occur exactly once.
class_counts = data['ldalabel'].value_counts()
single_sample_classes = class_counts.loc[class_counts.eq(1)].index.tolist()

# Keep only the rows whose label is not one of those single-sample classes.
keep_mask = ~data['ldalabel'].isin(single_sample_classes)
data_filtered = data.loc[keep_mask]

# Feature matrix (first n_features columns) and label vector for classification.
X = data_filtered.iloc[:, :n_features].values
y = data_filtered['ldalabel'].values


# Train/test split is currently disabled; the model is fitted on all of (X, y).
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify = y, random_state=42)

In [2]:
# Show the feature matrix followed by the label vector, one per line.
print(X, y, sep='\n')

[[ 1.9206810e+00 -3.2983106e-01  7.5627910e-01 ...  1.3296798e-01
  -6.8067586e-01 -1.2198466e+00]
 [ 2.0319133e+00 -4.3267850e-01  7.8603300e-01 ...  1.8094125e-01
  -6.6059010e-01 -1.3792698e+00]
 [ 1.8612910e+00 -6.4994854e-01  5.4668593e-01 ... -1.2576580e-03
  -5.9963983e-01 -1.4581711e+00]
 ...
 [ 4.1770820e+00 -4.8849255e-01  2.0907507e+00 ...  1.6687960e+00
   2.7272210e+00 -3.1752954e+00]
 [ 3.7086596e+00 -7.1281374e-01  1.5987878e+00 ...  1.0452396e+00
   1.9527593e+00 -3.1408680e+00]
 [ 2.1647525e+00 -3.4640655e-01  1.3032643e+00 ...  3.7909890e-01
   4.7228730e-01 -1.0450515e+00]]
[ 5 16  5 ...  1  1  5]


In [3]:

# Create the linear discriminant analysis estimator.
# The fitted object holds the coefficients, intercept, priors, covariance, ...
# (store_covariance=True requests that the covariance be kept on the estimator).
lda = LinearDiscriminantAnalysis(store_covariance=True)

# Perform the LDA fit on all of (X, y), then project X with the fitted model.
lda.fit(X, y)
X_ldafit = lda.transform(X)


In [4]:
from sklearn.metrics import classification_report

# NOTE: everything below is evaluated on the same (X, y) that the model was
# fitted on, so these numbers describe training-set fit, not held-out accuracy.
train_accuracy = lda.score(X, y)
print(np.array([train_accuracy]))
# Important results (accuracy, coeffs... etc.) are meant to be saved.

# Per-class precision, recall, f1-score and support for the predicted labels.
y_pred = lda.predict(X)
report = classification_report(y, y_pred)
print(report)


[0.97416262]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      8043
           3       0.88      0.91      0.89       592
           4       0.86      0.92      0.89      1183
           5       0.99      0.97      0.98      3753
          15       0.96      0.97      0.96      1092
          16       0.98      0.94      0.96       799
          17       0.96      0.94      0.95       690
          18       0.95      0.95      0.95       477
          19       0.83      0.91      0.87       199
          20       0.89      0.97      0.93        96
          21       0.91      0.98      0.95        97
          22       0.74      1.00      0.85        17
          23       0.99      1.00      0.99        69

    accuracy                           0.97     17107
   macro avg       0.92      0.96      0.94     17107
weighted avg       0.98      0.97      0.97     17107



In [None]:
# Show the feature matrix followed by the label vector, one per line.
print(X, y, sep='\n')

# Next step: test the LDA with new X points.

In [None]:
# Probe the fitted LDA with synthetic points: row i is all zeros except for a
# -2 in feature i, so every row moves along exactly one feature axis.
# The matrix is sized from n_features (the width X was built with) instead of a
# hard-coded 128, and it gets its own name so the `data` DataFrame loaded
# earlier is not overwritten by an ndarray.
probe_points = -2 * np.eye(n_features)
print(probe_points)


predictions = lda.predict(probe_points)

print(predictions)


# IDEA: why not make an algorithm that continuously bleeds 100 over
# to the rest of the vector in the column (note left unfinished).


In [None]:

# For each candidate class id 0..24, print the id and then the positions in
# `predictions` that were assigned to that class.
for class_id in range(25):
    print(class_id)
    w = np.where(predictions == class_id)
    print(w)

In [None]:
# `classes` was never defined anywhere in this notebook (NameError on a fresh
# run); the class labels seen during fitting are exposed as lda.classes_.
print(lda.classes_)

In [None]:

#Load data file(s)
data_filepath = '../data/embs/model1-10000/layer5/Oembs/embs-NOOUTLIERSpca.csv'
data = pd.read_csv(data_filepath,delimiter=',')

# Question: where is the vector that takes you to the middle of an LDA
# prediction category? For example, the vector 0,0,0,0,0 seems to be the middle
# of category 2, because once you are around that vector you will always be
# category 2.
#
# How do we find that? For each training class:
#   - take the mean vector of that class,
#   - run the LDA prediction on the mean vector,
#   - see if the mean vector also gives the same class,
#   - and check that everything around the mean vector gives the same class.

LABEL_COLUMN = 133            # positional index of the class-label column
SKIPPED_CLASSES = (0, 6, 14)  # class ids deliberately excluded from the scan

mean_classes = []            # one mean feature vector per processed class
predictions_of_classes = []  # LDA prediction for each of those mean vectors
for each_class in range(25):

    if each_class in SKIPPED_CLASSES:
        continue

    class_rows = data[data.iloc[:, LABEL_COLUMN] == each_class]
    if class_rows.empty:
        # The mean of zero rows is NaN, which lda.predict rejects; skip instead.
        print('class', each_class, 'has no rows - skipped')
        continue

    # The mean must have as many features as the LDA was fitted on (n_features);
    # a 3-column mean cannot be passed to an LDA fitted on n_features columns.
    # NOTE(review): assumes the first n_features columns of this CSV are the
    # same features the LDA was trained on - confirm (file name mentions PCA).
    X_train_class = class_rows.iloc[:, :n_features].to_numpy(dtype=float)
    mean_emb_class = X_train_class.mean(axis=0)

    class_prediction = lda.predict(mean_emb_class.reshape(1, -1))

    mean_classes.append(mean_emb_class)
    predictions_of_classes.append(class_prediction)
    print(class_prediction)

print(mean_classes)

In [None]:
# Goal: for a class mean, figure out how much uncertainty is in that class,
# i.e. how far each embedding parameter can move while the prediction stays
# the same.

# Step size and maximum number of steps for the search.
variance_step = 0.1
n_steps = 50000

# Number of leading features to examine.
target_features = 3

# Reference point: the first mean vector collected in mean_classes.
mean_class_fg2 = mean_classes[0]

# Class the LDA assigns to the unperturbed reference point.
true_prediction = lda.predict(mean_class_fg2[np.newaxis, :])
#print(variant_vector)
print(true_prediction)


Introduce an increasing offset into each feature until the predicted category changes. This finds, for each feature, how far it can vary around a class mean before the prediction for that class breaks.

In [None]:

#There are two ways we can do this (that I can think of)
#1) not-safe extreme - let each feature vary on its own, independent of the rest, until the prediction breaks
#2) safe extreme - vary ALL features at once, until a prediction breaks... take that as the minimum range for the class


#1) case
#print(mean_class_fg2)
mean_class_fg2_copy = mean_class_fg2.copy()

for each_target_feature in range(target_features):
    # Start every search from the unperturbed mean. Deriving the probe vector
    # from the mean (instead of a hard-coded length-3 array) keeps its length
    # equal to the mean vector's, whatever the feature count is.
    variant_vector = mean_class_fg2_copy.copy()
    total_variance = 0

    for steps in range(n_steps):
        total_variance = total_variance + variance_step
        # Only the target feature moves; all other features stay at the mean.
        variant_vector[each_target_feature] = (
            mean_class_fg2_copy[each_target_feature] + total_variance
        )

        prediction = lda.predict(variant_vector.reshape(1, -1))

        # Stop at the first offset whose prediction differs from the mean's.
        if prediction != true_prediction:
            break

    # Feature index, then the offset reached (the first offset that changed
    # the prediction, or n_steps * variance_step if it never changed).
    print(each_target_feature)
    print(total_variance)

    #see if these correspond with the values above



In [None]:
#2) case

# Two ways to do this (that I can think of):
#   1) not-safe extreme - let each feature vary on its own, independent of the
#      rest, until the prediction breaks
#   2) safe extreme - vary ALL features at once, until a prediction breaks...
#      take that as the minimum range for the class

# Goal: for a class mean, figure out how much uncertainty is in that class,
# i.e. how far the embedding parameters can move while the prediction stays
# the same.

# Step size and maximum number of steps for the search.
variance_step = 0.1
n_steps = 50000

# Number of features to examine.
target_features = 128

# Reference point: the fifth mean vector collected in mean_classes.
mean_class_fg2 = mean_classes[4]

# Class the LDA assigns to the unperturbed reference point.
true_prediction = lda.predict(mean_class_fg2[np.newaxis, :])
#print(variant_vector)
print(true_prediction)


In [None]:


#2) case
#print(mean_class_fg2)
mean_class_fg2_copy = mean_class_fg2.copy()

variant_vector = mean_class_fg2_copy.copy()
total_variance = 0

for steps in range(n_steps):
    total_variance = total_variance + variance_step
    # Shift every feature down by the same offset in one vectorised step;
    # this works for any feature count instead of a hard-coded 128.
    variant_vector = mean_class_fg2_copy - total_variance

    prediction = lda.predict(variant_vector.reshape(1, -1))

    # Stop at the first offset whose prediction differs from the mean's.
    if prediction != true_prediction:
        print(true_prediction)
        print(prediction)
        break

print(total_variance)

    #see if these correspond with the values above

# One step back from the offset reached above. Uses variance_step rather than
# a literal 0.1 so it stays correct if the step size is changed.
lower_bound = mean_class_fg2 - total_variance + variance_step
# Print the prediction at the lower bound; as a bare mid-cell expression the
# result was computed and then silently discarded.
print(lda.predict(lower_bound.reshape(1, -1)))

print(mean_class_fg2)

In [None]:
# Map each class label (column 133) to the values in columns 134 and 135.
# If a label occurs on several rows, the last row wins.
labels = {}
label_table = data.iloc[:, [133, 134, 135]]
for label_key, first_value, second_value in label_table.itertuples(index=False, name=None):
    labels[label_key] = (first_value, second_value)

print(labels)

# Row i of lda.coef_ belongs to lda.classes_[i], so the label rows must be
# emitted in lda.classes_ order. Iterating the dictionary instead would give
# first-appearance order from the file and pair labels with the wrong
# coefficient rows.
missing_labels = [cls for cls in lda.classes_ if cls not in labels]
if missing_labels:
    raise KeyError(
        'no label row found in data for LDA classes: {}'.format(missing_labels)
    )

# One [label, value from column 134, value from column 135] row per LDA class.
list_of_labels = [[cls, *labels[cls]] for cls in lda.classes_]
print(list_of_labels)

# Append the three label columns to the right of the coefficient matrix.
# NOTE(review): if columns 134/135 hold strings, hstack yields a string array
# and np.savetxt's default numeric format will fail - confirm column types.
coefs_labeled = np.hstack((lda.coef_, list_of_labels))


coef_savefilepath = save_filedir + 'LDAcoefslabeled.csv'
np.savetxt(coef_savefilepath,coefs_labeled,delimiter=',')


In [None]:
# Write the fitted LDA intercept vector to 'int.csv' in the save directory,
# comma-delimited.
intercept_savefilepath = ''.join([save_filedir, 'int.csv'])
np.savetxt(intercept_savefilepath, lda.intercept_, delimiter=',')