# KNN Algorithm for all the data

In [2]:
import pandas as pd

cardiovascular_data = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/ctd_cadiovascular_20240223_scores.tsv", sep='\t')
neoplasm_data = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/ctd_neoplasm_20240220_scores.tsv", sep='\t')
nervous_system_data = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/ctd_nervous-system_20240223_scores.tsv", sep='\t')
hypertension_data = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/ctd_hypertension_20240223_scores.tsv", sep='\t')

ground_truth_cardiovascular = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/Softwarepraktikum/merged_cardio.tsv", sep='\t')
ground_truth_neoplasm = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/Softwarepraktikum/merged_Neoplasm.tsv", sep='\t')
ground_truth_nervous_system = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/Softwarepraktikum/merged_nerveSys.tsv", sep='\t')
ground_truth_hypertension = pd.read_csv("C:/Users/sshor/Desktop/Softwarepraktikum2024/Softwarepraktikum/merged_hyperhyper.tsv", sep='\t')

In [3]:
print("Cardiovascular Data:")
print(cardiovascular_data.head())

# checking dimensions
print("\nDimensions of Cardiovascular Data:", cardiovascular_data.shape)

# summary statistics for numeric features
print("\nSummary Statistics for Cardiovascular Data:")
print(cardiovascular_data.describe())

print("\nNeoplasm Data:")
print(neoplasm_data.head())

print("\nDimensions of Neoplasm Data:", neoplasm_data.shape)

print("\nSummary Statistics for Neoplasm Data:")
print(neoplasm_data.describe())



Cardiovascular Data:
     drugA    drugB     sAB  opAB  meanspAB  medianspAB  minspAB  maxspAB  \
0  DB06709  DB08506  0.3427     0     0.615       0.630     0.46     0.74   
1  DB06709  DB02376  0.4188     0     0.685       0.685     0.56     0.81   
2  DB06709  DB04282  0.4188     0     0.685       0.685     0.56     0.81   
3  DB06709  DB04564  0.4188     0     0.685       0.685     0.56     0.81   
4  DB06709  DB04659  0.4188     0     0.685       0.685     0.56     0.81   

     zTDA    zTDB  ...  opAD  opBD  meanspAD  meanspBD  medianspAD  \
0  0.5097  1.3545  ...   2.0   0.0    0.5983    0.5773        0.59   
1  0.5097 -1.3890  ...   2.0   0.0    0.5983    0.5969        0.59   
2  0.5097 -1.3890  ...   2.0   0.0    0.5983    0.5969        0.59   
3  0.5097 -1.3890  ...   2.0   0.0    0.5983    0.5969        0.59   
4  0.5097 -1.3890  ...   2.0   0.0    0.5983    0.5969        0.59   

   medianspBD  minspAD  minspBD  maxspAD  maxspBD  
0        0.56      0.0     0.28     1.47   

In [4]:
merged_cardiovascular = pd.merge(cardiovascular_data, ground_truth_cardiovascular, on=['drugA', 'drugB'])
merged_neoplasm = pd.merge(neoplasm_data, ground_truth_neoplasm, on=['drugA', 'drugB'])
merged_nervous_system = pd.merge(nervous_system_data, ground_truth_nervous_system, on=['drugA', 'drugB'])
merged_hypertension = pd.merge(hypertension_data, ground_truth_hypertension, on=['drugA', 'drugB'])

In [5]:
#merging into 1 single list
merged_datasets = [merged_cardiovascular, merged_neoplasm, merged_nervous_system, merged_hypertension]


In [6]:
print(cardiovascular_data.columns)
print(ground_truth_cardiovascular.columns)


Index(['drugA', 'drugB', 'sAB', 'opAB', 'meanspAB', 'medianspAB', 'minspAB',
       'maxspAB', 'zTDA', 'zTDB', 'zDTA', 'zDTB', 'sA', 'sB', 'opA', 'opB',
       'sAD', 'sBD', 'opAD', 'opBD', 'meanspAD', 'meanspBD', 'medianspAD',
       'medianspBD', 'minspAD', 'minspBD', 'maxspAD', 'maxspBD'],
      dtype='object')
Index(['drugA', 'drugB', 'drugcomb', 'adv/app'], dtype='object')


In [7]:
#defining types of distances
import numpy as np

def euclidean_distance(X, Y):
    norm_X = np.sum(np.square(X), axis=1)
    norm_Y = np.sum(np.square(Y), axis=1)
    return np.sqrt(norm_Y - 2 * X @ Y.T + np.reshape(norm_X, (X.shape[0], 1)))


def cosine_distance(X, Y):
    norm_X = np.sqrt(np.sum(np.square(X), axis=1))
    norm_X = np.where(norm_X != 0, norm_X, np.nan)
    norm_Y = np.sqrt(np.sum(np.square(Y), axis=1))
    norm_Y = np.where(norm_Y != 0, norm_Y, np.nan)
    res = np.divide(X@Y.T, norm_Y)
    res = np.divide(res, np.reshape(norm_X, (X.shape[0], 1)))
    res = np.where(np.isnan(res), 0, 1 - res)
    return res

In [10]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import NearestNeighbors

#custom KNNClassifier class
class KNNClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, k=5, weights='uniform', metric='euclidean'):
        self.k = k
        self.weights = weights
        self.metric = metric
        self.nn_model = None

    def fit(self, X, y):
        self.X = X
        self.y = y
        if self.metric == 'euclidean':
            distance_func = euclidean_distance
        elif self.metric == 'cosine':
            distance_func = cosine_distance
        else:
            raise ValueError("Invalid distance metric. Supported metrics: 'euclidean', 'cosine'.")

        self.nn_model = NearestNeighbors(n_neighbors=self.k, algorithm='brute', metric=distance_func)
        self.nn_model.fit(X)

    def predict(self, X):
        distances, indices = self.nn_model.kneighbors(X)
        if self.weights == 'uniform':
            weights = np.ones_like(distances)
        elif self.weights == 'distance':
            weights = 1.0 / distances
        else:
            raise ValueError("Invalid weight scheme. Supported weights: 'uniform', 'distance'.")

        # Weighted majority voting
        class_counts = np.zeros((X.shape[0], len(np.unique(self.y))))
        for i in range(self.k):
            class_counts += np.eye(len(np.unique(self.y)))[self.y[indices[:, i]]]

        # Get the index of the class with the maximum count
        predicted_classes = np.argmax(class_counts, axis=1)
        return predicted_classes



In [13]:
import numpy as np

def kfold(n, n_folds):
    res = []
    indexes = np.arange(n)
    folds = np.array(np.array_split(indexes, n_folds))
    for num in range(n_folds):
        arr_test = np.array(folds[num])
        elements = np.arange(n)
        arr_train = elements[np.isin(elements, arr_test, invert=True)]
        res.append((arr_train, arr_test))
    return res


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import clone

# Step 1: Splitting the dataset into training and testing 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Step 2: k-fold cross-validation
n_folds = 5
splits = kfold(len(X_train), n_folds)

# Step 3: Train and evaluate the model on each fold
model = KNNClassifier(k=5, weights='uniform', metric='euclidean')
scores = []

for train_index, val_index in splits:
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    Y_train_fold, Y_val_fold = Y_train.iloc[train_index], Y_train.iloc[val_index]
    
    # Clone the model to avoid modifying the original instance
    cloned_model = clone(model)
    cloned_model.fit(X_train_fold, Y_train_fold)
    
    #  predictions on the validation set
    Y_val_pred = cloned_model.predict(X_val_fold)
    
    # Evaluate the accuracy of the model
    accuracy = accuracy_score(Y_val_fold, Y_val_pred)
    scores.append(accuracy)

# Step 4: Select the best model and evaluate it on the testing set
best_model_index = np.argmax(scores)
best_model = clone(model)
best_model.fit(X_train, Y_train)
Y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
print("Test Accuracy:", test_accuracy)


ValueError: could not convert string to float: 'DB01137'