In [71]:
import argparse
from small_text.classifiers import ConfidenceEnhancedLinearSVC, SklearnClassifier
from small_text.data import SklearnDataset
from small_text.classifiers.factories import SklearnClassifierFactory
from preprocess import data_loader, preprocess_data_sklearn_train, preprocess_data_sklearn_test, df_to_dict
from learner_functions import run_multiple_experiments
import random
import json
import numpy as np
from sklearn.metrics import f1_score


In [65]:
def parse_args():
    parser=argparse.ArgumentParser(description="Supervised Learning Experiment Runner with SkLearn Integration")
    parser.add_argument('--method', type = str, metavar ="", default = 'SL', help="Supervised == SL or Active == AL")
    parser.add_argument('--framework', type = str, metavar ="", default = 'SK', help="Transformers == TF or SkLearn == SK")
    parser.add_argument('--datadir', type = str, metavar ="",default = './data/', help="Path to directory with data files")
    parser.add_argument('--dataset', type = str, metavar ="",default = 'wiki', help="Name of dataset")
    parser.add_argument('--outdir', type = str, metavar ="",default = './results/', help="Path to output directory for storing results")
    parser.add_argument('--sklearn_model', type = str, metavar ="",default = 'ConfidenceEnhancedLinearSVC', help="Name of SkLearn model")
    parser.add_argument('--class_imbalance', type = int, metavar ="", default = 50, help = 'Class imbalance desired in train dataset')
    parser.add_argument('--train_n', type = int, metavar ="", default = 20000, help = 'Total number of training examples')
    parser.add_argument('--test_n', type = int, metavar ="", default = 5000, help = 'Total number of testing examples')
    parser.add_argument('--run_n', type = int, metavar ="", default = 5, help = 'Number of times to run each model')
    args=parser.parse_args()
    print("the inputs are:")
    for arg in vars(args):
        print("{} is {}".format(arg, getattr(args, arg)))
    return args

In [66]:
args=parse_args()
train_df, test_dfs = data_loader(args)

the inputs are:
method is SL
framework is /Users/raymond/Library/Jupyter/runtime/kernel-v2-74393Lk9O4Bxpvbp.json
datadir is ./data/
dataset is wiki
outdir is ./results/
sklearn_model is ConfidenceEnhancedLinearSVC
class_imbalance is 50
train_n is 20000
test_n is 5000
run_n is 5


In [67]:
model = ConfidenceEnhancedLinearSVC()
num_classes = 2
clf = SklearnClassifier(model, num_classes)

In [68]:
train_dict = df_to_dict('train', train_df)
# print(train_df.head())
train, vectorizer = preprocess_data_sklearn_train(train_dict['data'],
                                                      train_dict['target'],
                                                      train_dict['weak_target'])
test_sets = {}
matching_indexes = {}
for j in test_dfs.keys():
    matching_indexes[j] = test_dfs[j].index.tolist()
    data_dict = df_to_dict('test', test_dfs[j])
    processed_data = preprocess_data_sklearn_test(data_dict['data'],
                                                    data_dict['target'],
                                                    vectorizer)
    test_sets[j] = processed_data



In [24]:
test_sets

{'test_base': <small_text.data.datasets.SklearnDataset at 0x17f5b3790>,
 'test_50': <small_text.data.datasets.SklearnDataset at 0x280a96950>,
 'test_10': <small_text.data.datasets.SklearnDataset at 0x2835b83d0>,
 'test_5': <small_text.data.datasets.SklearnDataset at 0x106af7250>}

In [74]:
results_dict = {}
predictions_dict = {}
# Run experiment n times
# for run in range(args.run_n):
run = 0
seed_value = run
random.seed(seed_value)
np.random.seed(seed_value)
print(f'----RUN {run}: {args.method} LEARNER----')

indices_neg_label = np.where(train.y == 0)[0]
indices_pos_label = np.where(train.y == 1)[0]

# remove and check
# all_indices = np.concatenate([indices_neg_label, indices_pos_label])
# np.random.shuffle(all_indices)
# 

# x_indices_initial = all_indices.astype(int)
# y_initial = np.array([train.y[i] for i in x_indices_initial])
# print(f'Starting imbalance: {np.round(np.mean(y_initial),2)}')
# val_indices = None

train_set = SklearnDataset(train.x, train.y)
clf.fit(train_set)




----RUN 0: SL LEARNER----




<small_text.classifiers.classification.SklearnClassifier at 0x283ab6c50>

In [78]:
y_train_pred = clf.predict(train_set)
train_score = f1_score(train.y, y_train_pred, average='macro', zero_division = 0)
train_score

0.9872499563629756

In [80]:
for k, v in test_sets.items():
    test_set = SklearnDataset(v.x, v.y)
    y_test_pred = clf.predict(test_set)
    test_score = f1_score(v.y, y_test_pred, average='macro', zero_division = 0)
    print(f'{k}: {test_score}')

test_base: 0.7754274721554835
test_50: 0.8752674392237094
test_10: 0.7878238532233632
test_5: 0.7065843471364379




In [79]:
test_sets

{'test_base': <small_text.data.datasets.SklearnDataset at 0x283c9ac50>,
 'test_50': <small_text.data.datasets.SklearnDataset at 0x2839f0e10>,
 'test_10': <small_text.data.datasets.SklearnDataset at 0x283830d90>,
 'test_5': <small_text.data.datasets.SklearnDataset at 0x155204a10>}

In [58]:
train_df.iloc[10079]

text                 ` Who moved the section on ``public support`` ...
label                                                                0
matches                                                             []
len_matches                                                          0
len_text                                                            22
norm_kw                                                            0.0
weak_pos_keywords                                                    0
Name: 78160832, dtype: object