# Active Learner
Using Parameters from Park, Lee & Moon  
Pool based, SVM, inductive -> those parameters are standard  
Initial set: Normally, k-means is used. PLM recommend their own algorithm, LCA.  
After all documents are labeled, the corpus becomes input to the Sentiment Lexicon Extraction Model (SLE-BE), which creates the domain-specific lexicon.

In [113]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_pickle("./data/cleaned_submissions.pkl")

In [3]:
text = data["text"]
labels = data["sentiment"]

# Get labeled instances
indices = []
for i in range(0, len(labels), 10):
    indices.append(i)
    
# Set seed
X_seed = text[indices]
y_seed = labels[indices]

In [4]:
# Get unlabeled instances
new_indices = []
for i in range(0, len(data["text"].index)):
    if i not in indices:
        new_indices.append(i)

In [5]:
# Encode labels
from sklearn.preprocessing import LabelEncoder
lenc = LabelEncoder()
y_train = lenc.fit_transform(y_seed)

# Vectorize text using tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(preprocessor=' '.join, lowercase=False, min_df=5) # min_df = Minimum occurance of words
X_train = tfidf.fit_transform(data["text"])

In [6]:
y_train_seed = y_train # To feed the Active Leaner
X_train_seed = X_train[indices] # To feed the Active Learner
X_train_pool = X_train[new_indices] # Instances that need to be labeled

In [7]:
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling
from sklearn.svm import SVC

learner = ActiveLearner(
    estimator = SVC(probability=True),
    query_strategy = uncertainty_sampling,
    X_training = X_train_seed,
    y_training = y_train_seed
)

In [8]:
unqueried_score = learner.score(X_train_seed, y_train_seed)
unqueried_score

0.8918407128933444

In [93]:
query_idx, query_sample = learner.query(X_train_pool)

In [117]:
new_label = vader_sentiment_labels(query_sample)

In [119]:
X_train_pool[query_idx][0].shape

(1, 19267)

In [121]:
X_train_seed[0].shape

(1, 19267)

In [118]:
learner.teach(X = X_train_pool[query_idx][0], y = new_label, only_new=True)

TypeError: Singleton array array(2) cannot be considered a valid collection.

In [116]:
# Using Vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# for sentence in data["text"][indices][:30]:
#     vs = analyzer.polarity_scores(" ".join(sentence))
#     print("{:} {}".format(" ".join(sentence), str(vs)))
#     print("-"*40)


def vader_sentiment_labels(query_text):
    sentence = tfidf.inverse_transform(query_text)
    vs = analyzer.polarity_scores(" ".join(sentence[0]))
    if vs["compound"] > 0.05:
        return np.int(2) # Positive Sentiment
    elif vs["compound"] < -0.05:
        return np.int(0) # Negative Sentiment
    else:
        return np.int(1) # Neutral Sentiment

# Flair

In [35]:
# Flair
# https://www.analyticsvidhya.com/blog/2019/02/flair-nlp-library-python/ #
import flair
import torch

In [38]:
## Importing the Embeddings ##
from flair.embeddings import WordEmbeddings
from flair.embeddings import CharacterEmbeddings
from flair.embeddings import StackedEmbeddings
from flair.embeddings import FlairEmbeddings
from flair.embeddings import BertEmbeddings
from flair.embeddings import ELMoEmbeddings
from flair.embeddings import FlairEmbeddings

### Initialising embeddings (un-comment to use others) ###
#glove_embedding = WordEmbeddings('glove')
#character_embeddings = CharacterEmbeddings()
#flair_forward  = FlairEmbeddings('news-forward-fast')
flair_backward = FlairEmbeddings('news-backward-fast')
#bert_embedding = BertEmbedding()
#elmo_embedding = ElmoEmbedding()

stacked_embeddings = StackedEmbeddings( embeddings = [ flair_backward ])

In [None]:
from tqdm import tqdm ## tracks progress of loop ##

# creating a tensor for storing sentence embeddings #
s = torch.zeros(0,z)

# iterating Sentence (tqdm tracks progress) #
for tweet in tqdm(txt):   
  # empty tensor for words #
  w = torch.zeros(0,z)   
  sentence = Sentence(tweet)
  stacked_embeddings.embed(sentence)
  # for every word #
  for token in sentence:
    # storing word Embeddings of each word in a sentence #
    w = torch.cat((w,token.embedding.view(-1,z)),0)
  # storing sentence Embeddings (mean of embeddings of all words)   #
  s = torch.cat((s, w.mean(dim = 0).view(-1, z)),0)