In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
dataset = pd.read_csv('drug_embedding.csv')
print(len(dataset))
print(dataset.head())

300
   Methoxamine  pazopanib   Betaine  ioversol  Moricizine  Tetrofosmin  \
0     0.017247   0.007961 -0.063701 -0.043919    0.024263     0.040614   
1    -0.041623   0.075804 -0.334263  0.095424    0.000796     0.011437   
2     0.190152  -0.126302  0.005715 -0.080937   -0.203299     0.026808   
3     0.157794   0.021697 -0.237033 -0.058357   -0.044682    -0.007127   
4     0.221838  -0.000749 -0.570003 -0.168058   -0.049307    -0.046850   

   Ticlopidine  glimepiride  Iothalamic Acid  Buprenorphine  ...  Ethamsylate  \
0     0.025450    -0.013580        -0.012309      -0.004083  ...    -0.016790   
1    -0.021015     0.006967        -0.197068       0.049864  ...    -0.112515   
2    -0.021011     0.122890        -0.152093       0.029875  ...     0.225788   
3     0.044908    -0.001245        -0.011849       0.176401  ...    -0.006181   
4     0.250354    -0.185083         0.061384      -0.007641  ...     0.062186   

   Nafronyl  Thioctate  potassium bromide  Carbomer 1342   Propa

In [3]:
dataset.columns

Index(['Methoxamine', 'pazopanib', 'Betaine', 'ioversol', 'Moricizine',
       'Tetrofosmin', 'Ticlopidine', 'glimepiride', 'Iothalamic Acid',
       'Buprenorphine',
       ...
       'Ethamsylate', 'Nafronyl', 'Thioctate', 'potassium bromide',
       'Carbomer 1342', 'Propane', 'Vitamin B Complex',
       'Sodium Phosphate, Dibasic', 'triazulenone', 'Dextran 70'],
      dtype='object', length=2213)

In [4]:
X = dataset.iloc[:, 10].values.reshape(-1, 1)  # Features
Y = dataset.iloc[:, 8].values  # Labels
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Y_train_categorical = pd.cut(Y_train, bins=5, labels=False)
Y_test_categorical = pd.cut(Y_test, bins=5, labels=False)

# k-NN classifier
classifier = KNeighborsClassifier(n_neighbors=15, p=2, metric='euclidean')
classifier.fit(X_train_scaled, Y_train_categorical)

y_pred_categorical = classifier.predict(X_test_scaled)

In [6]:
conf = confusion_matrix(Y_test_categorical, y_pred_categorical)
print('confusion matrix', conf)

confusion matrix [[ 0  0  1  0  0]
 [ 0  1  8  0  0]
 [ 0  1 35  0  0]
 [ 0  1 11  0  0]
 [ 0  0  2  0  0]]


In [7]:
f1 = f1_score(Y_test_categorical, y_pred_categorical, average='weighted') 
print("F1 Score:", f1)

F1 Score: 0.47661290322580646


In [8]:
accuracy = accuracy_score(Y_test_categorical, y_pred_categorical)
print("Accuracy Score:", accuracy)

Accuracy Score: 0.6
