In [1]:
import nltk
import faiss
import importlib
import numpy as np
import pandas as pd
import io
import modules
import pickle
import sys
import numpy
import logging
import modules.ActiveLearning as ActiveLearning
import modules.preprocess as preprocess
from modules.ModelWrap import ModelWrap
from modules.ActiveLearningBase import ActiveLearningBase
from modules.Suggester import Suggester
from modules.Suggest import Suggest
importlib.reload(modules)
importlib.reload(modules.Suggest)
importlib.reload(modules.Suggester)
importlib.reload(modules.ActiveLearningBase)
importlib.reload(modules.ActiveLearning)
importlib.reload(preprocess)
from tqdm import tqdm
from joblib import dump, load
from multiprocessing import Pool
from sklearn import preprocessing
from nltk import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from scipy.stats import entropy


np.set_printoptions(threshold=100000)
random_seed = 42

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOPAPEC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOPAPEC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# This time we are using reddit self-post classification dataset https://www.kaggle.com/mswarbrickjones/reddit-selfposts/?select=subreddit_info.csv
# We will assume that post title and post body can be concatenated into one text column.
embeddings_dim = 300
tqdm.pandas()

def preprocess_and_save_dataset(dataset_path, preprocessed_path):
    dataframe = pd.read_csv(dataset_path, sep='\t')
    dataframe.loc[:, "text"] = dataframe.loc[:, "title"] + " " + dataframe.loc[:, "selftext"]
    dataframe = dataframe.drop(["title", "selftext"], axis=1)
    preprocess_pipeline(dataframe)
    dataframe.to_pickle(preprocessed_path)
    del dataframe

def vectorize_and_save_dataset(pickle_path, output_path, output_path_labels):
    dataset = pd.read_pickle(pickle_path)
    vectorized = np.zeros((dataset.shape[0], embeddings_dim))
    embeddings = get_glove_reddit_embeddings()
    for i, (vec, row) in enumerate(zip(vectorized, dataset.iterrows())):
        vectorized[i] = preprocess.row_to_embedding(row, embeddings, embeddings_dim)
    del embeddings
    with open(output_path, "wb") as file:
        np.save(file, vectorized)
    with open(output_path_labels, "wb") as file:
        np.save(file, dataset.loc[:, "subreddit"].to_numpy())
    del dataset

    
def preprocess_pipeline(dataset):
    cores = 12
    multicore_tok(dataset, cores)
    wnl = WordNetLemmatizer()
    dataset.loc[:, "text"].progress_apply(lemmatize_sent, args=[wnl])

def lemmatize_sent(wordlist, wnl):
    return ' '.join([wnl.lemmatize(w) for w in wordlist])
    
def multicore_tok(dataset, cores=6):
    with Pool(processes=cores) as pool:
        dataset.loc[:, "text"] = pool.map(nltk.word_tokenize, dataset.loc[:, "text"])

def multicore_lem(dataset, cores=6):
    with Pool(processes=cores) as pool:
        wnl = WordNetLemmatizer()
        for i, line in tqdm(enumerate(dataset.text)):
            dataset.loc[i,"text"] = pool.map(wnl.lemmatize, dataset.loc[i, "text"])
            
def get_glove_reddit_embeddings():
    # Number of words - 1623397 
    embeddings = {}
    tmp = []
    with io.open("GloVe.Reddit.120B.300D.txt", "r", encoding='utf-8') as file:
        file.readline()
        for line in tqdm(file, total=1623397):
            tmp.append(line)
    with Pool(processes=14) as pool:
        tmp = list(tqdm(pool.imap(preprocess.fetch_embeddings_value, tmp, chunksize=200000), total=1623397))
    for word, vector in tqdm(tmp):
        embeddings[word] = vector
    del tmp
    return embeddings

  from pandas import Panel


In [None]:
dataset_path = "selfpost/rspct.tsv"
preprocessed_path = "selfpost/preprocessed.pkl"
preprocess_and_save_dataset(dataset_path, preprocessed_path)

In [44]:
vectorized_output_path = "selfpost/vectorized.npy"
vectorized_labels_output_path = "selfpost/vectorized_labels.npy"
vectorize_and_save_dataset(preprocessed_path, vectorized_output_path, vectorized_labels_output_path)

In [4]:
from sklearn.metrics import log_loss

class LinearModel(ModelWrap):
    def __init__(self, model):
        self.model = model
        self.last_loss = 0
        
    def train(self, X, y):
        self.model.train(X,y)
    
    def predict(self, X):
        return self.model.predict(X)
    
    def predict_proba(self, X):
        return self.model.predict_proba(X)
    
    def get_last_loss(self, X_train, y_train):
        return log_loss(y_train, model.predict_proba(X_train), eps=1e-15)
        

In [2]:
from scipy.stats import entropy



class ConfidenceSamplingSuggestion(ActiveLearningBase):
    def __init__(self, n_top=1000):
        self.n_top = n_top

    def get_samples_for_labeling(self, model, X_test, y_test):
        y_proba = model.predict_proba(X_test)
        y_proba = np.max(y_proba, axis=1)
        ind = np.lexsort((y_test, y_proba))
        return "oracle", ind[:min(self.n_top, y_proba.shape[0])]

class MarginSampling(ActiveLearningBase):
    def __init__(self, n_top=1000):
        self.n_top = n_top
        
    def get_samples_for_labeling(self, model, X_test, y_test):
        y_proba = model.predict_proba(X_test)
        y_proba = np.sort(y_proba, axis=1)[:,::-1]
        y_proba = y_proba[:,0] - y_proba[:,1]
        ind = np.lexsort((y_test, y_proba))
        return "oracle", ind[:min(self.n_top, y_proba.shape[0])]

class EntropySampling(ActiveLearningBase):
    def __init__(self, n_top=1000):
        self.n_top = n_top
        
    def get_samples_for_labeling(self, model, X_test, y_test):
        y_proba = model.predict_proba(X_test)
        y_proba = entropy(y_proba, axis=1)
        ind = np.lexsort((y_test, y_proba))[::-1]
        print(y_proba[ind[0]], y_proba[ind[-1]])
        return "oracle", ind[:min(self.n_top, y_proba.shape[0])]

In [2]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
vectorized_output_path = "selfpost/vectorized.npy"
vectorized_labels_output_path = "selfpost/vectorized_labels.npy"
with open(vectorized_output_path, "rb") as vect_X, open(vectorized_labels_output_path, "rb") as vect_y:
    X = np.load(vect_X, allow_pickle=True)
    y = np.load(vect_y, allow_pickle=True)
le.fit(y)
y = le.transform(y)
del le

In [11]:
model_path = "selfpost/models/logreg_100it"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=random_seed)
model = LogisticRegression(random_state=random_seed, n_jobs=-1, verbose=True)
model.fit(X_train, y_train)
dump(model, model_path)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 22.3min finished


['selfpost/models/logreg_100it']

In [3]:
model_path = "selfpost/models/logreg_100it"
model = load(model_path)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=random_seed)
X_train, y_train

(array([[ 0.06228423,  0.06092767, -0.14749891, ..., -0.3225482 ,
          0.14105452,  0.00854856],
        [ 0.08068533, -0.02982698, -0.12298278, ..., -0.27350412,
          0.05903656,  0.0867618 ],
        [ 0.03988495,  0.06417522, -0.20442137, ..., -0.29071138,
          0.03378086, -0.01860267],
        ...,
        [ 0.15182757,  0.10873834, -0.17133273, ..., -0.17019229,
          0.15758706,  0.04102596],
        [ 0.04931326,  0.03915333, -0.15517242, ..., -0.30393526,
          0.0845803 , -0.00351562],
        [ 0.09860376,  0.01065233, -0.13773885, ..., -0.28388423,
          0.1883342 , -0.11135513]]),
 array([470, 445, 853, ...,   9, 288, 667]))

In [None]:
wrappedModel = LinearModel(model)
confSuggest = ConfidenceSamplingSuggestion(10000)
sampl_ind = rng.choice(X_test.shape[0], 1000, replace=False)
ind = confSuggest.get_samples_for_labeling(model, X_test[sampl_ind, :], y_test[sampl_ind])

print(sampl_ind[ind[1]])

In [77]:
test = np.array([[1, 1], [2, 1]])
test_y = np.array([2, 4])

y_labels = np.array([1, 2, 3])
y_proba = np.array([[1, 0], [111, 1], [10, -2]])
y_proba = entropy(y_proba, axis=1)
ind = np.lexsort((y_proba, y_proba))
print(ind)

y_proba = np.array([[1, 0], [111, 1], [10, -2]])

test = np.append(test, y_proba[ind[0:2]], axis=0)
test

[2 0 1]


array([[ 1,  1],
       [ 2,  1],
       [10, -2],
       [ 1,  0]])

In [120]:
logging.getLogger().setLevel(logging.INFO)
sug = Suggester(X,y)


In [90]:
suggest_alg = EntropySampling(100)
sug.active_learning_suggest(suggest_alg, LinearModel(model), sample_ratio=0.01)

INFO:root:Looking at 8102 samples from test.


6.7792310653056544 8.090449338436718e-13


In [91]:
sug.last_suggest.indices

array([263663, 556963, 162239,  32687, 534678, 123910, 376963, 676468,
       232583, 569997, 102716, 393583, 645799, 105063, 738217, 767381,
       723656, 759143, 378407, 476941,  88606, 138546, 295977, 376829,
       185133, 183306, 299197, 167503, 128336, 406411, 341994, 229912,
       322558, 309247, 102168, 161123, 167053, 782591, 624216, 293301,
       495366, 319562, 602956,   5770, 205171, 627581, 189452, 194699,
       532958,   8917, 637468, 141753, 375895, 306330, 312546, 410753,
        36320, 772195, 399059, 795146, 142815, 580262, 405912,  42023,
       635218,   4919, 788857, 644924, 772022, 780991, 498361, 464755,
       586466,  40846, 517110, 786113, 320149, 306214, 610751, 782525,
       162965, 274700,  36375, 181143, 406103, 693918, 799682, 768447,
       142749,  46536, 604493,  93185, 282442, 147943, 483390, 623975,
       197345, 522718, 629541, 210976])

In [92]:
samples_to_move = sug.X_test[sug.last_suggest.indices]
samples_to_move.shape, sug.X_train.shape, sug.X_test.shape

((100, 300), (202800, 300), (810200, 300))

In [93]:
sug.apply_last_suggest()

In [94]:
sug.X_train.shape, sug.X_test.shape

((202900, 300), (810100, 300))

In [131]:
class PseudoLabeling(ActiveLearningBase):
    def __init__(self, n_top=1000):
        self.n_top = n_top

    def get_samples_for_labeling(self, model, X_test, y_test):
        y_proba = model.predict_proba(X_test)
        max_ind = np.argmax(y_proba, axis=1)
        y_proba = np.max(y_proba, axis=1)
        ind = np.lexsort((max_ind, y_proba))[::-1]
        ind_to_return = ind[:min(self.n_top, y_proba.shape[0])]
        return "relabeling", ind_to_return, max_ind[ind_to_return]

In [134]:
suggest_alg = PseudoLabeling(100)
sug.active_learning_suggest(suggest_alg, LinearModel(model), sample_ratio=0.01)

INFO:root:Looking at 8104 samples from test.


In [None]:
samples_to_move = sug.X_test[sug.last_suggest.indices]
samples_to_move.shape, sug.X_train.shape, sug.X_test.shape

In [136]:
sug.apply_last_suggest()



Let's benchmark our solutions.

In [2]:
vectorized_output_path = "selfpost/vectorized.npy"
vectorized_labels_output_path = "selfpost/vectorized_labels.npy"
with open(vectorized_output_path, "rb") as vect_X, open(vectorized_labels_output_path, "rb") as vect_y:
    X = np.load(vect_X, allow_pickle=True)
    y = np.load(vect_y, allow_pickle=True)
model_path = "selfpost/models/logreg_100it"
model = load(model_path)

In [None]:
logging.getLogger().setLevel(logging.INFO)
sug = Suggester(X,y)
suggest_alg = ActiveLearning.EntropySampling(10000)
epochs = 5
print(sug.evaluate_metrics(LinearModel(model)))
print(sug.X_test.shape, sug.y_test.shape)
for ep in range(epochs):
    sug.active_learning_suggest(suggest_alg, LinearModel(model), sample_ratio=0.1)
    sug.apply_last_suggest()
    print(sug.X_test.shape, sug.y_test.shape)
    model.fit(sug.X_train, sug.y_train)
    print(sug.evaluate_metrics(LinearModel(model)))

INFO:root:Looking at 81040 samples from test.


{'accuracy': 0.5454084402764067, 'f1_score': 0.5411580561102263, 'precision_score': 0.5616867560333363, 'recall_score': 0.5460126540293783}
(810400, 300) (810400,)
(800400, 300) (800400,)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 29.2min finished
INFO:root:Looking at 80040 samples from test.


{'accuracy': 0.546228135932034, 'f1_score': 0.5417091029620242, 'precision_score': 0.5615681180355546, 'recall_score': 0.5465371881251833}
(790400, 300) (790400,)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.


It actually took too long to train logreg from sklearn. I am going to implement one with pytorch to train on cuda.

In [4]:
import torch
import torch.nn as nn
torch.manual_seed(random_seed)


class LogReg(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(300, 1013),
            nn.LogSoftmax(dim=1)
        )
        
    def forward(self, x):
        return self.seq(x)

In [5]:
from sklearn.metrics import log_loss
import torch.nn.functional as F


class LinearModelTorch(ModelWrap):
    def __init__(self, model, iter):
        self.model = model
        self.last_loss = 0
        self.iter = iter
        
    def train(self, X, y):
        X = torch.from_numpy(X)
        y = torch.from_numpy(y)
        X = X.view(-1, 300)
        X = X.cuda()
        y = y.cuda()
        self.model = self.model.cuda()
        self.model.train()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-4)
        criterion = nn.NLLLoss()
        for epoch in range(self.iter):
            optimizer.zero_grad()
            output = self.model(X.float())
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            if (epoch % 10 == 0):
                print(f"ep{epoch}: {loss}")
    
    def predict(self, X):
        X = torch.from_numpy(X)
        X = X.view(-1, 300)
        X = X.cpu()
        self.model = self.model.cpu()
        self.model = self.model.eval()
        return torch.argmax(self.model(X.float()), axis=1)
    
    def predict_proba(self, X):
        X = torch.from_numpy(X)
        X = X.view(-1, 300)
        X = X.cpu()
        return torch.exp(self.model(X.float()))
    
    def get_last_loss(self, X_train, y_train):
        return log_loss(y_train, self.model.predict_proba(X_train), eps=1e-15)
        

In [11]:
model = LinearModelTorch(LogReg(), 100)
print(y_train.dtype)
model.train(X_train, y_train.astype(np.int64))

int32
ep0: 6.9341301918029785
ep10: 6.924700736999512
ep20: 6.916568279266357
ep30: 6.909415245056152
ep40: 6.90291166305542
ep50: 6.896817207336426
ep60: 6.89097785949707
ep70: 6.88529109954834
ep80: 6.879701614379883
ep90: 6.874173164367676


In [12]:
model_path = "selfpost/models/logreg_torch1000it"
torch.save(model.model.state_dict(), model_path)

In [6]:
model_path = "selfpost/models/logreg_torch1000it"
model = LogReg()
model.load_state_dict(torch.load(model_path))
model.eval()
model = LinearModelTorch(model, 1000)

In [7]:
logging.getLogger().setLevel(logging.INFO)
sug = Suggester(X,y)
suggest_alg = ActiveLearning.EntropySampling(10000)
epochs = 5
print(sug.evaluate_metrics(model))
print(sug.X_test.shape, sug.y_test.shape)
for ep in range(epochs):
    sug.active_learning_suggest(suggest_alg, model, sample_ratio=0.1)
    sug.apply_last_suggest()
    print(sug.X_test.shape, sug.y_test.shape)
    model.train(sug.X_train, sug.y_train)
    print(sug.evaluate_metrics(model))

  _warn_prf(average, modifier, msg_start, len(result))
INFO:root:Looking at 81040 samples from test.


{'accuracy': 0.00846248766041461, 'f1_score': 0.003575499436051318, 'precision_score': 0.029538474314824134, 'recall_score': 0.008681346212082577}
(810400, 300) (810400,)


RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.