In [4]:
import spacy
import numpy as np

In [6]:
%store -r w2v_model
# Restore `w2v_model` from the IPython %store database (saved by word2vec.ipynb).

In [8]:
%store -r doc2vec_final
# Restore `doc2vec_final` from the IPython %store database (saved by doc2vec.ipynb).

In [9]:
%store -r lda
# Restore `lda` from the IPython %store database (saved by nltk_coll_topic.ipynb).

In [10]:
%store -r lda_vectorizer
# Restore `lda_vectorizer` from the IPython %store database (saved by nltk_coll_topic.ipynb).

In [3]:
def preprocess_text(text, nlp=None):
    """Tokenize ``text`` with spaCy and return the lowercased token strings.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    nlp : callable, optional
        A spaCy pipeline, or any callable that maps a string to an iterable
        of objects exposing a ``.text`` attribute. When omitted, the
        ``en_core_web_sm`` pipeline is loaded on first use and cached on the
        function object, so later calls do not reload it from disk.

    Returns
    -------
    list of str
        One lowercased string per token, in document order.
    """
    if nlp is None:
        # Loading a spaCy model is expensive; do it once and reuse it.
        nlp = getattr(preprocess_text, "_nlp", None)
        if nlp is None:
            nlp = spacy.load("en_core_web_sm")
            preprocess_text._nlp = nlp
    return [token.text.lower() for token in nlp(text)]

In [5]:
def w2v_preprocess(example, model):
    """Turn a text into a list of word2vec vectors, one per token.

    Parameters
    ----------
    example : str
        Raw text; it is tokenized and lowercased with ``preprocess_text``.
    model : object
        Trained word2vec model exposing ``model.wv`` with membership test and
        item lookup (e.g. a gensim ``Word2Vec``).

    Returns
    -------
    list of numpy.ndarray
        One vector per token, in token order. Tokens missing from the model
        vocabulary are represented by an all-zero vector of the embedding size.
    """
    # Use the model that was passed in, not the notebook-level global.
    keyed_vectors = model.wv
    # Read the embedding size from the model; fall back to the previously
    # hard-coded 150 if the object does not expose ``vector_size``.
    dim = getattr(keyed_vectors, "vector_size", 150)

    return [
        keyed_vectors[word] if word in keyed_vectors else np.zeros(dim)
        for word in preprocess_text(example)
    ]



In [7]:
from gensim.models.phrases import Phrases, Phraser

In [11]:
def d2v_topic_preprocess(example, d2v_model, topic_model, topic_vectorizer):
    """Build the feature vector of one document: topic probabilities + doc2vec.

    Parameters
    ----------
    example : str
        Raw text of a single document.
    d2v_model : object
        Trained doc2vec model exposing ``infer_vector(tokens)``.
    topic_model : object
        Fitted topic model exposing ``transform`` (scikit-learn style; one
        output row per input document is assumed).
    topic_vectorizer : object
        Fitted vectorizer exposing ``transform`` on an iterable of documents.

    Returns
    -------
    numpy.ndarray
        1-D array: the topic vector followed by the inferred doc2vec vector.
    """
    # The vectorizer works on an iterable of documents, so wrap the single
    # text in a list (a bare string is not a valid collection of documents).
    ex_tf = topic_vectorizer.transform([example])
    # One row per document comes back; flatten the single row to 1-D.
    ex_topics = np.ravel(topic_model.transform(ex_tf))

    # Infer the doc2vec vector from the tokenized text.
    ex_tok = preprocess_text(example)
    ex_d2v = np.ravel(d2v_model.infer_vector(ex_tok))

    # np.concatenate takes the arrays as ONE sequence argument; a second
    # positional argument would be interpreted as ``axis``.
    # NOTE(review): topics come first, as in the original code - confirm this
    # matches the column order of the training features the scaler was fit on.
    return np.concatenate((ex_topics, ex_d2v))



In [19]:
from keras.models import load_model
from keras.utils import pad_sequences

# Restore both trained Keras classifiers from their saved HDF5 files
# (the LSTM is loaded first, then the CNN).
lstm_model, cnn_bce_model = (
    load_model('lstm_model.h5'),
    load_model("cnn_bce_model.h5"),
)




In [14]:
from sklearn.preprocessing import StandardScaler

In [16]:
%store -r input
# Restore `input` from the IPython %store database (saved by undersampling.ipynb):
# a list of vectors, one per document, each obtained by concatenating the
# document's doc2vec vector with its vector of topic-modeling probabilities.
# NOTE: this variable name shadows Python's built-in `input()` in this session.

In [17]:
# Fit the standardizer used by the CNN on the training feature vectors;
# X_train ends up holding the scaled training matrix.
scaler = StandardScaler()
X_train = scaler.fit_transform(np.asarray(input))

In [None]:
def get_inference(example, model_type):
    """Classify one text with the selected model and return binary labels.

    Parameters
    ----------
    example : str
        Raw text of a single document.
    model_type : str
        ``"lstm"`` to use the word2vec + LSTM pipeline, ``"cnn"`` to use the
        doc2vec/topic + CNN pipeline.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels obtained by thresholding the model's
        predictions at 0.5.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``"lstm"`` nor ``"cnn"``.
    """
    if model_type == "lstm":
        # (n_tokens, embedding_dim) matrix; the reshape keeps it 2-D even
        # when the text yields no tokens.
        ex_vectors = np.asarray(
            w2v_preprocess(example, w2v_model), dtype="float32"
        ).reshape(-1, w2v_model.wv.vector_size)
        # setting max_len from the training set
        max_len = 163
        # Wrap the document in a list so it is padded as ONE sequence of
        # token vectors -> (1, max_len, embedding_dim). A float dtype is
        # required: the default int32 would truncate the embeddings.
        # NOTE(review): default 'pre' padding/truncating is kept - confirm it
        # matches what was used at training time.
        X_ex = pad_sequences([ex_vectors], maxlen=max_len, dtype="float32")
        y_pred = lstm_model.predict(X_ex)

    elif model_type == "cnn":
        ex_vectors = np.asarray(
            d2v_topic_preprocess(example, doc2vec_final, lda, lda_vectorizer)
        )
        # The scaler expects a 2-D (n_samples, n_features) array.
        X_ex = scaler.transform(ex_vectors.reshape(1, -1))
        # Add the extra dimension the CNN expects: (1, 1, n_features).
        X_ex = X_ex.reshape((1, 1, X_ex.shape[1]))
        y_pred = cnn_bce_model.predict(X_ex)

    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'lstm' or 'cnn'."
        )

    # we now turn the predictions in binary labels
    threshold = 0.5
    return (y_pred > threshold).astype(int)
