In [0]:
import pandas as pd
from collections import defaultdict 
from tqdm import tqdm
import numpy as np
import csv
from collections import OrderedDict

# Load the phenotypic test results workbook. Later cells skip the first column
# when scanning codes and reuse its values as the column labels of `df0`.
df = pd.read_excel('phenotypic test results.xlsx')

In [0]:
# Gather every distinct value found in the result columns (all columns except
# the first) so the full set of phenotype codes can be inspected.
s = set()
s = s.union(*(pd.unique(df[c]) for c in df.columns[1:]))
s

{'I', 'IR', 'IS', 'R', 'RS', 'S', 'SI', 'SR', nan}

In [0]:
# Recode the phenotype strings into numeric labels:
#   'I'                    -> NaN
#   'IR', 'R', 'RS'        -> 1
#   'IS', 'S', 'SI', 'SR'  -> 0
# The replacement values are never themselves replacement targets, so grouping
# the codes per target value yields the same frame as replacing one at a time.
df = df.replace('I', np.nan)
df = df.replace(['IR', 'R', 'RS'], 1)
df = df.replace(['IS', 'S', 'SI', 'SR'], 0)

# Transpose once: row 0 of the transposed table holds the first column's values
# (used as the column labels), the remaining rows hold one result column each.
# The row index is deliberately passed wrapped in a list, exactly as before.
transposed = df.to_numpy().T
df0 = pd.DataFrame(data=transposed[1:], index=[df.columns[1:]], columns=transposed[0])

In [0]:
# The mutation lists are spread over 24 files: mutations/0_1.csv ... mutations/23_1.csv.
filenames = ['mutations/' + str(x) + '_1.csv' for x in range(0, 24)]

# Maps the first field of each CSV row to an integer array built from the
# remaining fields. Insertion order (file order, then row order) is preserved.
d_1 = OrderedDict()
for fname in filenames:
    # `with` guarantees the handle is closed (the bare open() leaked one handle
    # per file); newline='' is what the csv module asks for on text files.
    with open(fname, 'r', newline='') as handle:
        for row in csv.reader(handle):
            if not row:
                # csv.reader yields [] for blank lines; row[0] would raise IndexError.
                continue
            d_1[row[0]] = np.array(list(map(int, row[1:])))

In [0]:
# Union of every mutation id that occurs in any entry of d_1.
l_1 = set()
for k in d_1:
    l_1.update(list(d_1[k]))

# A single sorted ordering drives BOTH the column positions and the column
# labels. The original labelled the frame with the raw set, whose iteration
# order is not guaranteed to match sorted(l_1), so labels could end up attached
# to the wrong columns.
mutation_ids = sorted(l_1)
dict_number_mutation = dict(enumerate(mutation_ids))                 # column position -> mutation id
dict_mutation_number = {m: i for i, m in enumerate(mutation_ids)}    # mutation id -> column position

# Binary presence matrix: one row per key of d_1, one column per mutation id.
array_1 = np.zeros((len(d_1), len(mutation_ids)))

for i, k in enumerate(d_1):
  for j in d_1[k]:
    array_1[i, dict_mutation_number[j]] = 1

X = pd.DataFrame(data=array_1, index=list(d_1.keys()), columns=mutation_ids)

In [0]:
# Count how many times each label occurs among df0's columns.
labels, occurrences = np.unique(df0.columns, return_counts=True)

# Labels that occur more than once.
dup = labels[occurrences > 1]

# Keep the repeated columns aside for later reconciliation and remove them
# from the main table.
df_dulicated = df0[dup]
df0 = df0.drop(columns=dup)

In [0]:
# Re-insert each duplicated label as a single column, keeping the replicate
# with the fewest missing values.
for x in dup:
  # One column per replicate of label x (2-D because the label is repeated).
  replicates = df_dulicated[x].to_numpy(dtype=np.float64)
  nan_counts = np.isnan(replicates).sum(axis=0)
  # argmin returns the first minimum, so ties keep the earliest replicate,
  # which is what the two-column comparison did. Unlike that comparison, this
  # also considers a third or later replicate instead of silently ignoring it.
  df0[x] = replicates[:, nan_counts.argmin()]

# Select the columns named by the keys of d_1, in that order.
y = df0[list(d_1.keys())]

In [0]:
from sklearn.decomposition import PCA, NMF
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score
from sklearn.model_selection import cross_validate
from statistics import mean 
import warnings
# Silence all warnings for this sweep (this also hides any solver warnings
# emitted while fitting).
warnings.filterwarnings('ignore')


# Baseline experiment: cross-validate every classifier on the raw feature
# matrix X, separately for each row of df0, and record the mean of each metric.
# NOTE(review): no estimator is given a random_state, so the stochastic ones
# (e.g. RandomForestClassifier) will not reproduce exactly between runs.
result_columns = ['classifier', 'PCA', 'NMF', 'Accuracy', 'Recall', 'Precision', 'AUC', 'F1-Score', 'drug']
scoring = ['accuracy', 'roc_auc', 'f1', 'recall', 'precision']
classifiers = [
    ('LR-L1', LogisticRegression(penalty='l1', solver='liblinear')),
    ('LR-L2', LogisticRegression(penalty='l2')),
    ('LinearSVC', LinearSVC()),
    ('SVC', SVC()),
    ('AdaBoost', AdaBoostClassifier(n_estimators=10)),
    ('GB', GradientBoostingClassifier(max_features='sqrt')),
    ('RF', RandomForestClassifier()),
]

# Rows are collected in a plain list and turned into a DataFrame once.
# DataFrame.append (used previously) was removed in pandas 2.0 and copied the
# whole frame on every call.
rows = []
try:
    for name, clf in classifiers:
        print(name)
        for drug in tqdm(df0.index.values):
            # Keep only the columns that have a label in this row.
            y_new = df0.loc[drug].dropna()
            X_new = X.loc[y_new.index].to_numpy()
            y_new = y_new.to_numpy().astype('int')
            results = cross_validate(clf, X_new, y_new, scoring=scoring)
            acc = mean(results['test_accuracy'])
            r = mean(results['test_recall'])
            p = mean(results['test_precision'])
            f1 = mean(results['test_f1'])
            auc = mean(results['test_roc_auc'])
            # drug[0]: first element of the index key, as in the original.
            rows.append([name, None, None, acc, r, p, auc, f1, drug[0]])
finally:
    # Built in `finally` so an interrupted or failed sweep still leaves the
    # rows gathered so far in df_results, as row-by-row appending used to.
    df_results = pd.DataFrame(rows, columns=result_columns)












  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

LR-L1













 11%|█         | 1/9 [00:02<00:18,  2.31s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [00:04<00:16,  2.36s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [00:07<00:14,  2.35s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [00:07<00:09,  1.82s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [00:09<00:07,  1.93s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [00:11<00:05,  1.81s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [00:12<00:03,  1.72s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [00:15<00:01,  1.84s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [00:17<00:00,  1.89s/it]











  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

LR-L2













 11%|█         | 1/9 [00:19<02:35, 19.41s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [00:38<02:15, 19.29s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [00:58<01:56, 19.39s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [01:03<01:15, 15.17s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [01:22<01:04, 16.24s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [01:34<00:44, 14.98s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [01:46<00:28, 14.29s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [02:03<00:15, 15.05s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [02:19<00:00, 15.49s/it]











  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

LinearSVC













 11%|█         | 1/9 [00:11<01:29, 11.18s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [00:29<01:32, 13.21s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [00:41<01:16, 12.83s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [00:43<00:49,  9.85s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [00:55<00:41, 10.33s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [01:06<00:31, 10.48s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [01:16<00:21, 10.52s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [01:32<00:11, 11.91s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [01:48<00:00, 12.01s/it]











  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

SVC













 11%|█         | 1/9 [02:29<19:56, 149.57s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [06:09<19:54, 170.69s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [08:47<16:41, 166.91s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [08:54<09:54, 118.88s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [11:13<08:19, 124.82s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [12:35<05:36, 112.06s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [13:54<03:24, 102.26s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [16:27<01:57, 117.42s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [19:10<00:00, 127.86s/it]











  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

AdaBoost













 11%|█         | 1/9 [00:32<04:23, 32.96s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [01:04<03:48, 32.62s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [01:37<03:16, 32.78s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [01:42<02:02, 24.45s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [02:14<01:46, 26.65s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [02:33<01:13, 24.34s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [02:51<00:45, 22.50s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [03:18<00:23, 23.60s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [03:42<00:00, 24.74s/it]











  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

GB













 11%|█         | 1/9 [00:05<00:41,  5.21s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [00:10<00:36,  5.15s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [00:15<00:30,  5.16s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [00:16<00:19,  3.97s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [00:21<00:17,  4.30s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [00:24<00:11,  3.98s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [00:27<00:07,  3.69s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [00:32<00:03,  3.83s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [00:35<00:00,  4.00s/it]











  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A

RF













 11%|█         | 1/9 [00:15<02:01, 15.13s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [00:30<01:45, 15.08s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [00:46<01:32, 15.39s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [00:48<00:57, 11.47s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [01:02<00:48, 12.20s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [01:11<00:33, 11.12s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [01:19<00:20, 10.47s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [01:34<00:11, 11.64s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [01:47<00:00, 11.94s/it]


In [0]:
# Write the baseline (no PCA / no NMF) metrics table to an Excel workbook.
df_results.to_excel('results.xlsx')

In [0]:
# Show every warning again for this sweep.
warnings.filterwarnings('always')

# PCA experiment: same classifiers and metrics as the baseline sweep, but the
# features are first projected onto n principal components.
# NOTE(review): PCA is fitted on all labelled samples before cross_validate
# splits them, so the held-out folds influence the projection. Wrapping PCA and
# the classifier in a pipeline would avoid that; left unchanged here so the
# evaluation protocol stays the same.
result_columns = ['classifier', 'PCA', 'NMF', 'Accuracy', 'Recall', 'Precision', 'AUC', 'F1-Score', 'drug']
scoring = ['accuracy', 'roc_auc', 'f1', 'recall', 'precision']
classifiers = [
    ('LR-L1', LogisticRegression(penalty='l1', solver='liblinear')),
    ('LR-L2', LogisticRegression(penalty='l2')),
    ('LinearSVC', LinearSVC()),
    ('SVC', SVC()),
    ('AdaBoost', AdaBoostClassifier(n_estimators=10)),
    ('GB', GradientBoostingClassifier(max_features='sqrt')),
    ('RF', RandomForestClassifier()),
]

# The projection depends only on (drug, n), not on the classifier, so it is
# computed during the first classifier's pass and reused by the others instead
# of being refitted once per classifier.
projected = {}

# Rows are collected in a list; DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
rows = []
try:
    for name, clf in classifiers:
        for drug in tqdm(df0.index.values):
            for n in [50, 100, 150]:
                if (drug, n) not in projected:
                    # Keep only the columns that have a label in this row.
                    labels = df0.loc[drug].dropna()
                    components = PCA(n_components=n).fit_transform(X.loc[labels.index].to_numpy())
                    projected[(drug, n)] = (components, labels.to_numpy().astype('int'))
                X_new, y_new = projected[(drug, n)]
                results = cross_validate(clf, X_new, y_new, scoring=scoring)
                acc = mean(results['test_accuracy'])
                r = mean(results['test_recall'])
                p = mean(results['test_precision'])
                f1 = mean(results['test_f1'])
                auc = mean(results['test_roc_auc'])
                # drug[0]: first element of the index key, as in the original.
                rows.append([name, n, None, acc, r, p, auc, f1, drug[0]])
finally:
    # Built in `finally` so an interrupted or failed sweep still leaves the
    # rows gathered so far in df_results_PCA.
    df_results_PCA = pd.DataFrame(rows, columns=result_columns)












  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A










 11%|█         | 1/9 [00:15<02:02, 15.31s/it][A[A[A[A[A[A[A[A[A[A[A










 22%|██▏       | 2/9 [00:28<01:43, 14.74s/it][A[A[A[A[A[A[A[A[A[A[A










 33%|███▎      | 3/9 [00:42<01:27, 14.57s/it][A[A[A[A[A[A[A[A[A[A[A










 44%|████▍     | 4/9 [00:46<00:56, 11.20s/it][A[A[A[A[A[A[A[A[A[A[A










 56%|█████▌    | 5/9 [00:59<00:47, 11.84s/it][A[A[A[A[A[A[A[A[A[A[A










 67%|██████▋   | 6/9 [01:07<00:31, 10.65s/it][A[A[A[A[A[A[A[A[A[A[A










 78%|███████▊  | 7/9 [01:15<00:19,  9.94s/it][A[A[A[A[A[A[A[A[A[A[A










 89%|████████▉ | 8/9 [01:26<00:10, 10.11s/it][A[A[A[A[A[A[A[A[A[A[A










100%|██████████| 9/9 [01:36<00:00, 10.67s/it]











STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://sciki

In [0]:
# Write the PCA-experiment metrics table to an Excel workbook.
df_results_PCA.to_excel('results_PCA.xlsx')

In [0]:
# NMF experiment: random forest only, with the features replaced by an
# n-component NMF factorisation.
# NOTE(review): as in the PCA sweep, NMF is fitted on all labelled samples
# before cross_validate splits them.
result_columns = ['classifier', 'PCA', 'NMF', 'Accuracy', 'Recall', 'Precision', 'AUC', 'F1-Score', 'drug']
scoring = ['accuracy', 'roc_auc', 'f1', 'recall', 'precision']

# Rows are collected in a list; DataFrame.append was removed in pandas 2.0 and
# copied the whole frame on every call.
rows = []
try:
    for clf, name in zip([RandomForestClassifier()], ['RF']):
        for drug in tqdm(df0.index.values):
            # Labels and the matching feature rows depend only on the drug, so
            # they are extracted once rather than once per n.
            labels = df0.loc[drug].dropna()
            features = X.loc[labels.index].to_numpy()
            y_new = labels.to_numpy().astype('int')
            for n in [50, 100, 150]:
                X_new = NMF(n_components=n).fit_transform(features)
                results = cross_validate(clf, X_new, y_new, scoring=scoring)
                acc = mean(results['test_accuracy'])
                r = mean(results['test_recall'])
                p = mean(results['test_precision'])
                f1 = mean(results['test_f1'])
                auc = mean(results['test_roc_auc'])
                # drug[0]: first element of the index key, as in the original.
                rows.append([name, None, n, acc, r, p, auc, f1, drug[0]])
finally:
    # Built in `finally` so an interrupted or failed sweep still leaves the
    # rows gathered so far in df_results_NMF.
    df_results_NMF = pd.DataFrame(rows, columns=result_columns)















  0%|          | 0/9 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A[A[A[A[A[A













 11%|█         | 1/9 [12:00<1:36:02, 720.32s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A













 22%|██▏       | 2/9 [23:58<1:23:57, 719.61s/it][A[A[A[A[A[A[A[A[A[A[A[A[A[A

In [0]:
# Write the NMF-experiment metrics table to an Excel workbook.
df_results_NMF.to_excel('results_NMF.xlsx')