In [1]:
# Colab-only setup: mount Drive at /content/drive so the notebook can reach
# files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# The file names used in the rest of the notebook are relative, so switch the
# working directory to the project folder on the mounted Drive. The directory
# is printed before and after the change as a sanity check.
project_dir = '/content/drive/My Drive/1006'
print(os.getcwd())
os.chdir(project_dir)
print(os.getcwd())

/content
/content/drive/My Drive/1006


In [3]:
import pandas as pd
import numpy as np
import ast
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [4]:
# Dataset prefix used to build the input file names read below
# (e.g. dataset_name + '_cvec_train.csv').
dataset_name = 'eo'

In [5]:
# Embedding names; each one appears in the index file names loaded later.
embed_types = [
    'cvec_pca16',
    'cvec_nmf16',
    'cvec_umap16',
    'cvec_tsne16',
    'bert',
    'roberta',
    'distil',
    'glove6B',
    'universal',
]

# Subset-selection strategies; combined with embed_types to form result keys.
selection_types = [
    'taddy',
    'kmeans',
    'kld',
    'ks',
    'cos',
    'recon',
]

# Training-subset sizes: the evaluation loops train on the first `c` indices
# of an ordering for every `c` listed here.
counts = [
    100, 200, 300, 400,
    700, 1000, 1400, 1800,
    2400, 3000, 3600, 4200,
]

## Results Dict



In [6]:
# Result keys: two stand-alone runs plus one "<embedding>_<selection>" key for
# every embedding/selection pair (embedding varies slowest).
key_list = ['random_noembed', 'topics_taddy']
key_list += [
    f'{embed}_{selection}'
    for embed in embed_types
    for selection in selection_types
]

# One dict per metric; every key starts with its own empty list that the
# evaluation cells append to.
acc_dict, f1_dict, roc_dict = (
    {key: [] for key in key_list} for _ in range(3)
)

## Complete dataset

In [7]:
def _load_split(split):
    """Read '<dataset_name>_cvec_<split>.csv' and return (features, labels).

    The first CSV column is used as the index. The 'label' column becomes the
    label vector and all remaining columns become the feature matrix; both are
    returned as NumPy arrays. The DataFrame goes out of scope on return, so
    only the arrays stay alive.
    """
    frame = pd.read_csv(dataset_name + '_cvec_' + split + '.csv', index_col=0)
    labels = frame['label'].to_numpy()
    features = frame.drop(columns=['label']).to_numpy()
    return features, labels


# Training
X_train, y_train = _load_split('train')

# Evaluation
X_test, y_test = _load_split('test')

In [8]:
# Candidate smoothing values for MultinomialNB; this grid is reused by every
# grid search later in the notebook.
parameters = {'alpha':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 6, 8, 10, 12, 14, 16]}

# Baseline: tune alpha by cross-validated ROC AUC on the complete training set.
gscv = GridSearchCV(MultinomialNB(), parameters, verbose=1, scoring='roc_auc')
gscv.fit(X_train, y_train)
mnb = gscv.best_estimator_
print(gscv.best_params_)

# Predict the test labels once and reuse them for both label-based metrics.
test_pred = mnb.predict(X_test)
print(accuracy_score(y_test, test_pred))
print(f1_score(y_test, test_pred))
# ROC AUC is computed from the probability of the second class (column 1).
print(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:  1.1min finished


{'alpha': 16}
0.681042654028436
0.38313473877176907
0.6580137957158743


## Random Pick

In [9]:
# Read the randomly picked training subsets. Each non-empty line of the file
# is parsed as a Python literal (a list of row indices into the training
# arrays). The file name is derived from dataset_name so this cell follows the
# dataset selected at the top of the notebook instead of always reading the
# 'eo' file.
indices_list = []
with open('indices_' + dataset_name + '_random.txt') as fh:
    for line in fh:
        # A blank (e.g. trailing) line would make ast.literal_eval raise.
        if line.strip():
            indices_list.append(ast.literal_eval(line))

In [10]:
# Fit one tuned MultinomialNB per random subset and record its test metrics
# under the 'random_noembed' key.
for lst in indices_list:
    gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
    gscv.fit(X_train[lst], y_train[lst])
    mnb = gscv.best_estimator_
    print(gscv.best_params_)

    # Predict once; accuracy and F1 share the same label predictions.
    test_pred = mnb.predict(X_test)
    acc_dict['random_noembed'].append(accuracy_score(y_test, test_pred))
    f1_dict['random_noembed'].append(f1_score(y_test, test_pred))
    roc_dict['random_noembed'].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

{'alpha': 0.0001}
{'alpha': 1}
{'alpha': 1}
{'alpha': 1}
{'alpha': 1}
{'alpha': 4}
{'alpha': 0.01}
{'alpha': 12}
{'alpha': 2}
{'alpha': 0.1}
{'alpha': 12}
{'alpha': 0.01}


## K-means Clustering

In [11]:
# For every embedding, read its k-means index file and evaluate each subset it
# contains. Each non-empty line is parsed as one list of training-row indices.
for embed in embed_types:
    indices_list = []
    with open('indices_' + dataset_name + '_' + embed + '_kmeans.txt') as fh:
        for line in fh:
            # A blank (e.g. trailing) line would make ast.literal_eval raise.
            if line.strip():
                indices_list.append(ast.literal_eval(line))

    result_key = embed + '_kmeans'
    for lst in indices_list:
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[lst], y_train[lst])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, test_pred))
        f1_dict[result_key].append(f1_score(y_test, test_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

## Greedy farthest points based on KL Divergence

In [12]:
# Load the KL-divergence index ordering of every embedding.
#
# The evaluation cell that consumes indices_list pairs position k with
# embed_types[k], so each file must contribute exactly one list. Anything else
# would silently shift results onto the wrong embedding (or fail later with an
# IndexError), so the mismatch is reported here instead.
indices_list = []
for embed in embed_types:
    path = 'indices_' + dataset_name + '_' + embed + '_kld.txt'
    with open(path) as fh:
        # Blank lines are skipped; every other line is parsed as a Python literal.
        orderings = [ast.literal_eval(line) for line in fh if line.strip()]
    if len(orderings) != 1:
        raise ValueError(
            'expected exactly one index list in %s, found %d' % (path, len(orderings))
        )
    indices_list.append(orderings[0])

In [13]:
# indices_list[i] is treated as the index ordering for embed_types[i]. For
# every subset size c in counts, train on the first c indices of that ordering
# and record the test metrics under '<embedding>_kld'.
for i, lst in enumerate(indices_list):
    result_key = embed_types[i] + '_kld'
    for c in counts:
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[lst[:c]], y_train[lst[:c]])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, test_pred))
        f1_dict[result_key].append(f1_score(y_test, test_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

## Greedy farthest points based on the Kolmogorov-Smirnov statistic

In [14]:
# Load the Kolmogorov-Smirnov index ordering of every embedding.
#
# The evaluation cell that consumes indices_list pairs position k with
# embed_types[k], so each file must contribute exactly one list. Anything else
# would silently shift results onto the wrong embedding (or fail later with an
# IndexError), so the mismatch is reported here instead.
indices_list = []
for embed in embed_types:
    path = 'indices_' + dataset_name + '_' + embed + '_ks.txt'
    with open(path) as fh:
        # Blank lines are skipped; every other line is parsed as a Python literal.
        orderings = [ast.literal_eval(line) for line in fh if line.strip()]
    if len(orderings) != 1:
        raise ValueError(
            'expected exactly one index list in %s, found %d' % (path, len(orderings))
        )
    indices_list.append(orderings[0])

In [15]:
# indices_list[i] is treated as the index ordering for embed_types[i]. For
# every subset size c in counts, train on the first c indices of that ordering
# and record the test metrics under '<embedding>_ks'.
for i, lst in enumerate(indices_list):
    result_key = embed_types[i] + '_ks'
    for c in counts:
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[lst[:c]], y_train[lst[:c]])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, test_pred))
        f1_dict[result_key].append(f1_score(y_test, test_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

## Greedy farthest points based on cosine distance

In [16]:
# Load the cosine-distance index ordering of every embedding.
#
# The evaluation cell that consumes indices_list pairs position k with
# embed_types[k], so each file must contribute exactly one list. Anything else
# would silently shift results onto the wrong embedding (or fail later with an
# IndexError), so the mismatch is reported here instead.
indices_list = []
for embed in embed_types:
    path = 'indices_' + dataset_name + '_' + embed + '_cos.txt'
    with open(path) as fh:
        # Blank lines are skipped; every other line is parsed as a Python literal.
        orderings = [ast.literal_eval(line) for line in fh if line.strip()]
    if len(orderings) != 1:
        raise ValueError(
            'expected exactly one index list in %s, found %d' % (path, len(orderings))
        )
    indices_list.append(orderings[0])

In [17]:
# indices_list[i] is treated as the index ordering for embed_types[i]. For
# every subset size c in counts, train on the first c indices of that ordering
# and record the test metrics under '<embedding>_cos'.
for i, lst in enumerate(indices_list):
    result_key = embed_types[i] + '_cos'
    for c in counts:
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[lst[:c]], y_train[lst[:c]])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, test_pred))
        f1_dict[result_key].append(f1_score(y_test, test_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

## Reconstruction Loss Minimization

In [18]:
# One saved index array per embedding, converted to a plain list and stored in
# embed_types order, so position k of indices_list lines up with embed_types[k].
indices_list = [
    list(np.load('indices_' + dataset_name + '_' + embed + '_recon.npy'))
    for embed in embed_types
]

In [19]:
# indices_list[i] is treated as the index ordering for embed_types[i]. For
# every subset size c in counts, train on the first c indices of that ordering
# and record the test metrics under '<embedding>_recon'.
for i, lst in enumerate(indices_list):
    result_key = embed_types[i] + '_recon'
    for c in counts:
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[lst[:c]], y_train[lst[:c]])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, test_pred))
        f1_dict[result_key].append(f1_score(y_test, test_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

## Taddy

In [20]:
# For every embedding, load its Taddy selections and evaluate each of them,
# recording the test metrics under '<embedding>_taddy'.
for embed in embed_types:
    # SECURITY: allow_pickle=True unpickles the file contents, which can run
    # arbitrary code. Only load .npy files produced by a trusted pipeline.
    indices_list = np.load('indices_'+dataset_name+'_'+embed+'_taddy.npy', allow_pickle=True)

    result_key = embed + '_taddy'
    for entry in indices_list:
        # Element 0 of each entry is used as the training-row indices; what the
        # remaining elements hold is not visible in this notebook.
        subset = entry[0]
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[subset], y_train[subset])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, test_pred))
        f1_dict[result_key].append(f1_score(y_test, test_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

## Taddy Topics

In [21]:
# Evaluate the Taddy selections stored in the 'cvec_orig_taddy' index file and
# record the test metrics under 'topics_taddy'.
for embed in ['cvec_orig_taddy']:
    # SECURITY: allow_pickle=True unpickles the file contents, which can run
    # arbitrary code. Only load .npy files produced by a trusted pipeline.
    indices_list = np.load('indices_'+dataset_name+'_'+embed+'.npy', allow_pickle=True)

    for entry in indices_list:
        # Element 0 of each entry is used as the training-row indices; what the
        # remaining elements hold is not visible in this notebook.
        subset = entry[0]
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_train[subset], y_train[subset])
        mnb = gscv.best_estimator_

        # Predict once; accuracy and F1 share the same label predictions.
        test_pred = mnb.predict(X_test)
        acc_dict['topics_taddy'].append(accuracy_score(y_test, test_pred))
        f1_dict['topics_taddy'].append(f1_score(y_test, test_pred))
        roc_dict['topics_taddy'].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:,1]))

In [22]:
# Show the ROC AUC results as a table: the frame is built with one row per
# result key and then flipped so that each key becomes a column.
pd.DataFrame.from_dict(roc_dict, orient='index').T

Unnamed: 0,random_noembed,topics_taddy,cvec_pca16_taddy,cvec_pca16_kmeans,cvec_pca16_kld,cvec_pca16_ks,cvec_pca16_cos,cvec_pca16_recon,cvec_nmf16_taddy,cvec_nmf16_kmeans,cvec_nmf16_kld,cvec_nmf16_ks,cvec_nmf16_cos,cvec_nmf16_recon,cvec_umap16_taddy,cvec_umap16_kmeans,cvec_umap16_kld,cvec_umap16_ks,cvec_umap16_cos,cvec_umap16_recon,cvec_tsne16_taddy,cvec_tsne16_kmeans,cvec_tsne16_kld,cvec_tsne16_ks,cvec_tsne16_cos,cvec_tsne16_recon,bert_taddy,bert_kmeans,bert_kld,bert_ks,bert_cos,bert_recon,roberta_taddy,roberta_kmeans,roberta_kld,roberta_ks,roberta_cos,roberta_recon,distil_taddy,distil_kmeans,distil_kld,distil_ks,distil_cos,distil_recon,glove6B_taddy,glove6B_kmeans,glove6B_kld,glove6B_ks,glove6B_cos,glove6B_recon,universal_taddy,universal_kmeans,universal_kld,universal_ks,universal_cos,universal_recon
0,0.532368,0.566192,0.588919,0.602969,0.532882,0.509939,0.561631,0.554302,0.560539,0.618151,0.589879,0.525073,0.524562,0.542499,0.439576,0.545042,0.540549,0.601013,0.597648,0.582625,0.523719,0.581142,0.522797,0.530267,0.533515,0.581746,0.485232,0.537424,0.494927,0.578604,0.617784,0.55579,0.588645,0.522387,0.577074,0.522455,0.52876,0.609838,0.584405,0.604901,0.590366,0.567606,0.512591,0.535826,0.481831,0.515426,0.542081,0.545417,0.548044,0.600765,0.584464,0.62361,0.485472,0.530613,0.529522,0.59585
1,0.525026,0.553121,0.579655,0.548362,0.539339,0.619145,0.556825,0.574234,0.562641,0.631091,0.557185,0.595352,0.501602,0.568584,0.504922,0.602009,0.549489,0.616753,0.594218,0.579252,0.545458,0.591169,0.568132,0.574725,0.574121,0.580786,0.538676,0.585506,0.512651,0.580203,0.619704,0.587505,0.561244,0.583278,0.614594,0.570365,0.544143,0.599445,0.611475,0.620301,0.590996,0.560323,0.533183,0.54501,0.567221,0.582221,0.626504,0.601791,0.58151,0.593116,0.531659,0.539103,0.515581,0.541372,0.581492,0.59779
2,0.61238,0.550523,0.577733,0.548874,0.603802,0.633893,0.583039,0.604936,0.573721,0.601627,0.565637,0.604794,0.540407,0.587373,0.552611,0.596814,0.572646,0.609095,0.588087,0.605487,0.593435,0.613525,0.574713,0.595033,0.607278,0.613127,0.54695,0.624065,0.587865,0.593611,0.606494,0.605008,0.576875,0.62885,0.612065,0.604924,0.54349,0.626098,0.537654,0.585307,0.601032,0.589933,0.539696,0.537197,0.557432,0.574978,0.641928,0.603941,0.54786,0.604662,0.582264,0.593967,0.556609,0.541389,0.588608,0.600799
3,0.615688,0.565379,0.569025,0.609726,0.567718,0.635225,0.608525,0.61117,0.569283,0.60003,0.555169,0.619077,0.546555,0.586373,0.532483,0.612109,0.576895,0.614079,0.620597,0.623203,0.593107,0.575399,0.599448,0.622616,0.608578,0.635028,0.591372,0.542357,0.603025,0.583065,0.573428,0.591422,0.605851,0.621913,0.620561,0.598091,0.582942,0.6223,0.57694,0.61428,0.595618,0.627514,0.540604,0.622448,0.53846,0.616298,0.615988,0.623198,0.557967,0.595689,0.540693,0.610167,0.571183,0.574062,0.585321,0.607079
4,0.604187,0.584996,0.577858,0.621724,0.607961,0.612661,0.622141,0.634206,0.54975,0.6243,0.602648,0.617649,0.556252,0.623492,0.593317,0.634036,0.577243,0.605037,0.640821,0.630743,0.61535,0.605874,0.621898,0.628153,0.620069,0.641431,0.61959,0.640147,0.622435,0.619533,0.635443,0.608508,0.618545,0.614587,0.62919,0.62108,0.585624,0.632572,0.623956,0.612761,0.622419,0.614006,0.584122,0.636673,0.598556,0.589164,0.63294,0.601709,0.625752,0.646194,0.620111,0.632015,0.615202,0.595676,0.603019,0.631149
5,0.634806,0.624937,0.581106,0.612481,0.631204,0.640623,0.647755,0.639593,0.575967,0.625821,0.602105,0.630171,0.607852,0.628508,0.617345,0.615247,0.607205,0.632768,0.594154,0.633991,0.609771,0.637854,0.615143,0.617771,0.621796,0.652132,0.600389,0.650259,0.625757,0.627803,0.640572,0.612566,0.568163,0.654989,0.632703,0.642885,0.604271,0.633299,0.575256,0.624583,0.596417,0.625717,0.652549,0.638195,0.585876,0.633737,0.629741,0.63597,0.630619,0.633599,0.600758,0.631299,0.629806,0.607677,0.615506,0.645064
6,0.615778,0.610362,0.612793,0.651677,0.631622,0.654369,0.640231,0.633798,0.59976,0.646159,0.616087,0.637911,0.630381,0.632277,0.625714,0.626682,0.639142,0.633817,0.619802,0.633608,0.641983,0.634763,0.643895,0.637464,0.615716,0.655358,0.658099,0.65589,0.632798,0.639675,0.648436,0.638995,0.648113,0.640383,0.641085,0.644762,0.607261,0.636728,0.633687,0.664969,0.607126,0.631567,0.623826,0.643446,0.62388,0.631674,0.653102,0.635109,0.646298,0.652343,0.618318,0.653212,0.639905,0.590202,0.625841,0.651451
7,0.625986,0.635399,0.631,0.641278,0.634462,0.650791,0.649431,0.639882,0.631834,0.647983,0.615512,0.636272,0.633904,0.645851,0.620746,0.649247,0.64079,0.647417,0.639153,0.639284,0.637096,0.634142,0.646585,0.645464,0.64587,0.657669,0.603604,0.656405,0.633029,0.656942,0.635587,0.651392,0.577874,0.646147,0.654117,0.64901,0.62483,0.647287,0.543183,0.650109,0.633166,0.636572,0.634345,0.647462,0.632164,0.626779,0.64517,0.632141,0.643532,0.643526,0.579348,0.646564,0.637302,0.612306,0.633078,0.629925
8,0.656473,0.647481,0.62951,0.654324,0.650161,0.656538,0.658173,0.644012,0.638729,0.623744,0.616326,0.63173,0.648939,0.639781,0.629288,0.642953,0.647794,0.646579,0.663672,0.64461,0.649737,0.63794,0.647982,0.648877,0.662319,0.660758,0.641863,0.652957,0.641817,0.665213,0.625768,0.6512,0.653236,0.645552,0.66085,0.647013,0.626924,0.651072,0.613466,0.664959,0.638908,0.641082,0.628979,0.66439,0.641278,0.6454,0.658417,0.644831,0.649168,0.640482,0.621589,0.653723,0.639254,0.625338,0.650784,0.645752
9,0.650752,0.663425,0.653119,0.654661,0.659481,0.652282,0.661687,0.65032,0.6532,0.638594,0.630765,0.634141,0.656625,0.635506,0.63244,0.651327,0.660319,0.643805,0.659681,0.641976,0.6425,0.651497,0.657252,0.653748,0.670778,0.659615,0.608469,0.657432,0.646541,0.657831,0.655016,0.657958,0.622199,0.662368,0.660144,0.647245,0.643988,0.65718,0.610942,0.647541,0.653092,0.65119,0.649453,0.671053,0.606116,0.636722,0.654988,0.6424,0.661726,0.641335,0.576488,0.652218,0.645101,0.637493,0.65142,0.649169


## Save dicts to csv

In [23]:
# Write one CSV per metric, named '<dataset_name>_<metric>_mnb.csv'. The prefix
# comes from dataset_name so that running the notebook on another dataset does
# not overwrite the 'eo' results. Each frame is built with one row per result
# key and then transposed, so every key becomes a column.
for metric_name, metric_dict in (('acc', acc_dict), ('f1', f1_dict), ('roc', roc_dict)):
    metric_table = pd.DataFrame.from_dict(metric_dict, orient='index').transpose()
    metric_table.to_csv(dataset_name + '_' + metric_name + '_mnb.csv')