# Active Learner
Using Parameters from Park, Lee & Moon  
Pool-based, SVM, inductive -> these settings are standard  
Initial set: normally, k-means is used to select it. PLM (Park, Lee & Moon) recommend their own algorithm, LCA.  
After all documents are labeled, the corpus becomes input to the Sentiment Lexicon Extraction Model (SLE-BE), which creates the domain-specific lexicon.

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_pickle("./data/cleaned_submissions.pkl")

In [3]:
# Split the frame into the sentiment annotations and the raw documents.
labels = data["sentiment"]
text = data["text"]

# Every 10th index value (0, 10, 20, ...) forms the initially labeled seed set.
indices = list(range(0, len(labels), 10))

# Seed labels and seed documents
y_seed = labels[indices]
X_seed = text[indices]

In [4]:
# Get unlabeled instances: every position that is not part of the seed set.
# A set difference keeps this near-linear; testing `i not in indices` against
# the list for every position was quadratic in the corpus size.
# sorted() restores the ascending order the original loop produced.
new_indices = sorted(set(range(len(data["text"].index))) - set(indices))

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Vectorize the whole corpus with TF-IDF. Each document is passed through
# ' '.join before tokenizing (presumably documents are token lists -- confirm)
# and the original casing is kept.
tfidf = TfidfVectorizer(
    preprocessor=' '.join,
    lowercase=False,
    min_df=5,  # drop terms that appear in fewer than 5 documents
)
X_train = tfidf.fit_transform(data["text"])

# Integer-encode the labels of the seed set only.
lenc = LabelEncoder()
y_train = lenc.fit_transform(y_seed)

In [7]:
y_train_seed = y_train # Encoded seed labels, used to initialise the Active Learner
X_train_seed = X_train[indices] # TF-IDF rows of the seed documents, used to initialise the Active Learner
X_train_pool = X_train[new_indices] # TF-IDF rows of the instances that still need to be labeled

In [8]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.svm import SVC

# Pool-based active learner: an SVM initialised on the seed set that picks
# new instances by uncertainty sampling.
learner = ActiveLearner(
    # probability=True enables SVC.predict_proba, which uncertainty sampling
    # uses to rank instances.
    estimator = SVC(probability=True),
    query_strategy = uncertainty_sampling,
    X_training = X_train_seed,
    y_training = y_train_seed
)

In [None]:
# The active learning loop: query a batch from the pool, teach the learner
# with that batch, then drop the batch from the pool.
# NOTE(review): X_pool and y_pool are not defined in any visible cell (the
# pool features built earlier are named X_train_pool) -- define them first.
# NOTE(review): the `verbose` keyword arguments are passed through by modAL to
# the underlying estimator calls; confirm that SVC accepts them.
n_queries = 10
for idx in range(n_queries):
    print('Query no. %d' % (idx + 1))
    query_idx, query_instance = learner.query(X_pool, n_instances=100, verbose=0)
    learner.teach(
        X=X_pool[query_idx], y=y_pool[query_idx], only_new=True,
        verbose=1
    )
    # Remove the queried instances from the pool. A boolean row mask is used
    # instead of np.delete because it works for dense arrays as well as for
    # scipy sparse matrices, which np.delete cannot handle.
    keep = np.ones(X_pool.shape[0], dtype=bool)
    keep[query_idx] = False
    X_pool = X_pool[keep]
    y_pool = np.asarray(y_pool)[keep]

In [46]:
# Score of the learner on the seed set it was initialised with (this is the
# training data, not a held-out set); the bare name displays the value.
unqueried_score = learner.score(X_train_seed, y_train_seed)
unqueried_score

0.8918407128933444

In [49]:
# Display the pool row index (indices) returned by the most recent query.
query_idx

array([28945])

In [48]:
# Ask the learner (uncertainty sampling) which pool instance to label next;
# returns the row index within X_train_pool and the corresponding sample.
# NOTE(review): `only_new` is otherwise used as a `teach` argument in this
# notebook; here it is passed to `query` -- confirm this is intended.
query_idx, query_sample = learner.query(X_train_pool, only_new=True)

In [27]:
# Let VADER act as the labeler for the queried sample (returns 0, 1 or 2).
new_label = vader_sentiment_labels(query_sample)

In [23]:
# Inspect the TF-IDF rows of the seed set.
print(X_train_seed)

  (0, 1450)	0.2760851148651717
  (0, 2508)	0.11781707787630487
  (0, 2943)	0.13012836367068326
  (0, 4092)	0.15106530196008486
  (0, 4835)	0.11381628723016271
  (0, 5099)	0.19049104914493728
  (0, 5221)	0.20750364190813167
  (0, 5614)	0.2327199010205387
  (0, 6199)	0.1705981500796615
  (0, 6921)	0.18531172541037932
  (0, 6967)	0.21866826181498003
  (0, 8103)	0.2589352264653529
  (0, 8421)	0.08098722664694373
  (0, 9039)	0.1194560246846339
  (0, 9739)	0.17778839240595484
  (0, 9834)	0.12541645689717978
  (0, 10511)	0.24258880843888206
  (0, 10599)	0.23586917726303752
  (0, 10845)	0.12521434608491913
  (0, 11787)	0.18808528665218463
  (0, 12066)	0.10731926861924031
  (0, 12321)	0.13985796674534032
  (0, 14158)	0.1372361240262782
  (0, 15377)	0.2020564417678004
  (0, 15421)	0.08484291482826199
  :	:
  (17952, 16129)	0.2086306888620502
  (17952, 16420)	0.07852986527476642
  (17952, 16506)	0.1494421971462161
  (17952, 16601)	0.13465720386427987
  (17952, 17205)	0.11548399676902829
  (17953,

In [22]:
# Inspect the TF-IDF row(s) of the pool instance(s) selected by the last query.
print(X_train_pool[query_idx])

  (0, 8085)	0.5320041686767372
  (0, 8300)	0.0772008763417568
  (0, 8686)	0.43779329884422785
  (0, 15144)	0.5296971883218979
  (0, 17000)	0.35877644618411414
  (0, 17580)	0.3317362914079504


In [44]:
# Display the label wrapped in a 1-element array, the form passed to `teach`.
np.array([new_label])

array([1])

In [45]:
# Add the queried instance and its VADER-derived label to the learner.
learner.teach(X = X_train_pool[query_idx], y = np.array([new_label]))

In [11]:
# Using VADER: one shared analyzer instance, read as a module-level name by
# vader_sentiment_labels.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# for sentence in data["text"][indices][:30]:
#     vs = analyzer.polarity_scores(" ".join(sentence))
#     print("{:} {}".format(" ".join(sentence), str(vs)))
#     print("-"*40)


def vader_sentiment_labels(query_text):
    """Label one queried TF-IDF row with VADER.

    The row is mapped back to its vocabulary terms with the module-level
    ``tfidf`` vectorizer, the terms are joined into one string, and that
    string is scored with the module-level VADER ``analyzer``.

    Args:
        query_text: TF-IDF representation of the queried instance(s), as
            accepted by ``tfidf.inverse_transform``. Only the first row is
            scored.

    Returns:
        int: 2 if the compound score is above 0.05 (positive), 0 if it is
        below -0.05 (negative), otherwise 1 (neutral).
    """
    # NOTE(review): inverse_transform yields the terms present in the row, so
    # the text scored here is presumably not the original sentence (word order
    # and terms dropped by the vectorizer are lost) -- confirm this is
    # acceptable for VADER.
    terms = tfidf.inverse_transform(query_text)[0]
    compound = analyzer.polarity_scores(" ".join(terms))["compound"]

    # Plain ints are returned: np.int was only an alias of the builtin int and
    # no longer exists in current NumPy releases.
    # NOTE(review): these codes have to match the classes produced by the
    # LabelEncoder used for the seed labels -- confirm.
    if compound > 0.05:
        return 2  # Positive Sentiment
    if compound < -0.05:
        return 0  # Negative Sentiment
    return 1  # Neutral Sentiment

# Flair

In [35]:
# Flair
# https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/ #
import flair
import torch

In [38]:
## Importing the Embeddings ##
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import ELMoEmbeddings
from flair.embeddings import FlairEmbeddings

### Initialising embeddings (uncomment a line to initialise that embedding) ###
#glove_embedding = WordEmbeddings('glove')
#character_embeddings = CharacterEmbeddings()
#flair_forward  = FlairEmbeddings('news-forward-fast')
# Only the backward Flair embedding is currently active.
flair_backward = FlairEmbeddings('news-backward-fast')
#bert_embedding = BertEmbeddings()
#elmo_embedding = ELMoEmbeddings()

# Combine the selected embeddings; the list currently holds only
# flair_backward, so any embedding enabled above must also be added here.
stacked_embeddings = StackedEmbeddings( embeddings = [ flair_backward ])

In [None]:
from tqdm import tqdm ## tracks progress of loop ##
from flair.data import Sentence  # was used below without being imported

# Length of one token embedding; taken from the embedding object because no
# visible cell defines it.
z = stacked_embeddings.embedding_length

# NOTE(review): `txt` is not defined in any visible cell; it is presumably an
# iterable of raw text strings -- define it before running this cell.

# One (1, z) row per sentence is collected here and concatenated once at the
# end, instead of re-allocating the result tensor on every iteration.
sentence_rows = []

# iterating over the texts (tqdm tracks progress) #
for tweet in tqdm(txt):
    sentence = Sentence(tweet)
    stacked_embeddings.embed(sentence)
    # word embeddings of this sentence, one row per token; the leading empty
    # tensor keeps the (0, z) shape for a sentence without tokens
    w = torch.cat(
        [torch.zeros(0, z)] + [token.embedding.view(-1, z) for token in sentence],
        0,
    )
    # sentence embedding = mean of the embeddings of all its words
    sentence_rows.append(w.mean(dim=0).view(-1, z))

# tensor storing the sentence embeddings, one row per text #
s = torch.cat([torch.zeros(0, z)] + sentence_rows, 0)