In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, mean_squared_error
from scipy.stats import spearmanr


f1_scores = []
accuracies = []
mses = []
sccs = []


dataset1 = pd.read_csv('ccl_feature_original.csv')
X1 = dataset1.iloc[:, 10].values.reshape(-1, 1)  
Y1 = dataset1.iloc[:, 8].values 
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1, Y1, random_state=0, test_size=0.2)

scaler1 = StandardScaler()
X_train_scaled1 = scaler1.fit_transform(X_train1)
X_test_scaled1 = scaler1.transform(X_test1)

Y_train_categorical1 = pd.cut(Y_train1, bins=5, labels=False)
Y_test_categorical1 = pd.cut(Y_test1, bins=5, labels=False)

classifier1 = KNeighborsClassifier(n_neighbors=15, p=2, metric='euclidean')
classifier1.fit(X_train_scaled1, Y_train_categorical1)

y_pred_categorical1 = classifier1.predict(X_test_scaled1)
conf1 = confusion_matrix(Y_test_categorical1, y_pred_categorical1)
f1_1 = f1_score(Y_test_categorical1, y_pred_categorical1, average='weighted')
accuracy1 = accuracy_score(Y_test_categorical1, y_pred_categorical1)

regressor1 = KNeighborsRegressor(n_neighbors=15, p=2, metric='euclidean')
regressor1.fit(X_train_scaled1, Y_train1)

y_pred1 = regressor1.predict(X_test_scaled1)
mse1 = mean_squared_error(Y_test1, y_pred1)
scc1, _ = spearmanr(Y_test1, y_pred1)

f1_scores.append(f1_1)
accuracies.append(accuracy1)
mses.append(mse1)
sccs.append(scc1)


dataset2 = pd.read_csv('ccl_feature.csv')
X2 = dataset2.iloc[:, 10].values.reshape(-1, 1)  
Y2 = dataset2.iloc[:, 8].values 
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X2, Y2, random_state=0, test_size=0.2)

scaler2 = StandardScaler()
X_train_scaled2 = scaler2.fit_transform(X_train2)
X_test_scaled2 = scaler2.transform(X_test2)

Y_train_categorical2 = pd.cut(Y_train2, bins=5, labels=False)
Y_test_categorical2 = pd.cut(Y_test2, bins=5, labels=False)

classifier2 = KNeighborsClassifier(n_neighbors=15, p=2, metric='euclidean')
classifier2.fit(X_train_scaled2, Y_train_categorical2)

y_pred_categorical2 = classifier2.predict(X_test_scaled2)
conf2 = confusion_matrix(Y_test_categorical2, y_pred_categorical2)
f1_2 = f1_score(Y_test_categorical2, y_pred_categorical2, average='weighted')
accuracy2 = accuracy_score(Y_test_categorical2, y_pred_categorical2)

regressor2 = KNeighborsRegressor(n_neighbors=15, p=2, metric='euclidean')
regressor2.fit(X_train_scaled2, Y_train2)

y_pred2 = regressor2.predict(X_test_scaled2)
mse2 = mean_squared_error(Y_test2, y_pred2)
scc2, _ = spearmanr(Y_test2, y_pred2)

f1_scores.append(f1_2)
accuracies.append(accuracy2)
mses.append(mse2)
sccs.append(scc2)


dataset3 = pd.read_csv('drug_embedding.csv')
X3 = dataset3.iloc[:, 10].values.reshape(-1, 1)  # Features
Y3 = dataset3.iloc[:, 8].values  # Labels
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X3, Y3, random_state=0, test_size=0.2)

scaler3 = StandardScaler()
X_train_scaled3 = scaler3.fit_transform(X_train3)
X_test_scaled3 = scaler3.transform(X_test3)

Y_train_categorical3 = pd.cut(Y_train3, bins=5, labels=False)
Y_test_categorical3 = pd.cut(Y_test3, bins=5, labels=False)

classifier3 = KNeighborsClassifier(n_neighbors=15, p=2, metric='euclidean')
classifier3.fit(X_train_scaled3, Y_train_categorical3)

y_pred_categorical3 = classifier3.predict(X_test_scaled3)
conf3 = confusion_matrix(Y_test_categorical3, y_pred_categorical3)
f1_3 = f1_score(Y_test_categorical3, y_pred_categorical3, average='weighted')
accuracy3 = accuracy_score(Y_test_categorical3, y_pred_categorical3)

regressor3 = KNeighborsRegressor(n_neighbors=15, p=2, metric='euclidean')
regressor3.fit(X_train_scaled3, Y_train3)

y_pred3 = regressor3.predict(X_test_scaled3)
mse3 = mean_squared_error(Y_test3, y_pred3)
scc3, _ = spearmanr(Y_test3, y_pred3)

f1_scores.append(f1_3)
accuracies.append(accuracy3)
mses.append(mse3)
sccs.append(scc3)


f1 = np.mean(f1_scores)
std = np.std(f1_scores)
accuracy = np.mean(accuracies)
mse = np.mean(mses)
scc = np.mean(sccs)

print("F1 Score:", f1)
print("Standard Deviation :", std)
print("Accuracy Score:", accuracy)
print("MSE:", mse)
print("SCC:", scc)


F1 Score: 0.47503431828714726
Standard Deviation : 0.3731182702162206
Accuracy Score: 0.5274379884471627
MSE: 7089.948769195536
SCC: 0.29541173673652166


In [5]:
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# Concatenate features and target variables from all datasets
X_combined = np.concatenate((X1, X2, X3), axis=0)
Y_combined = np.concatenate((Y1, Y2, Y3), axis=0)

# Define a function to calculate information gain
def calculate_information_gain(X, y):
    # Check if the target variable y is categorical or continuous
    if len(np.unique(y)) > 5:  # If the number of unique values in y is greater than 5, it's a regression task
        return mutual_info_regression(X, y)
    else:  # Otherwise, it's a classification task
        return mutual_info_classif(X, y)

# Calculate information gain for the combined dataset
information_gain_combined = calculate_information_gain(X_combined, Y_combined)

# Display information gain scores for each feature
print("Information Gain:")
for i, info_gain in enumerate(information_gain_combined):
    print(f"Feature {i+1}: {info_gain}")


Information Gain:
Feature 1: 0.7420273736041478
