# Prediction

Use this notebook to run our trained models on arbitrary text and obtain a prediction.

## Imports

In [1]:
import numpy as np
import joblib
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from nltk.tokenize import word_tokenize


## Define the Prediction Function (Loads Models and Tokenizer)

In [2]:
# Cache of every artifact needed for prediction. It is filled on the first call
# so the pickled models and the BERT weights are read from disk once per kernel
# session rather than on every prediction.
_PREDICTION_RESOURCES = {}


def _load_prediction_resources():
    """Return the models, vectorizer and tokenizer used by ``predict_input``.

    The artifacts are loaded from their relative paths on the first call and
    stored in ``_PREDICTION_RESOURCES``; later calls return the cached dict.

    Returns:
        dict with the keys ``meta_model``, ``lr_model``, ``rf_model``,
        ``xgb_model``, ``svm_model``, ``lrbert_model``, ``tfidf_vectorizer``,
        ``w2v_model``, ``idf_weights``, ``device``, ``bert_model`` and
        ``tokenizer``.
    """
    if _PREDICTION_RESOURCES:
        return _PREDICTION_RESOURCES

    # NOTE(security): joblib.load unpickles its input, which can execute
    # arbitrary code. Only point these paths at artifacts you trust.
    loaded = {
        "meta_model": joblib.load("../stacking-models/meta_model.pkl"),
        "lr_model": joblib.load("../base-models/logistic_regression_model.pkl"),
        "rf_model": joblib.load("../base-models/random_forest_model.pkl"),
        "xgb_model": joblib.load("../base-models/gradient_boosted_model.pkl"),
        "svm_model": joblib.load("../base-models/support_vector_machine_model.pkl"),
        "lrbert_model": joblib.load("../base-models/lr_bert_model.pkl"),
        "tfidf_vectorizer": joblib.load("../datasets/tfidf_vectorizer.pkl"),
        "w2v_model": joblib.load("../datasets/word2vec_model.model"),
    }

    # Map each TF-IDF vocabulary term to its IDF value; used to weight word vectors.
    vectorizer = loaded["tfidf_vectorizer"]
    loaded["idf_weights"] = dict(zip(vectorizer.get_feature_names_out(), vectorizer.idf_))

    device = "cpu"
    bert_model = DistilBertForSequenceClassification.from_pretrained("../base-models/finetuned_bert")
    bert_model.to(device)
    bert_model.eval()  # inference mode
    loaded["device"] = device
    loaded["bert_model"] = bert_model
    loaded["tokenizer"] = DistilBertTokenizerFast.from_pretrained("../base-models/finetuned_bert")

    # Publish only after everything loaded, so a failure part-way through
    # never leaves a half-filled cache behind.
    _PREDICTION_RESOURCES.update(loaded)
    return _PREDICTION_RESOURCES


def _weighted_w2v(text, model, idf_dict):
    """Return the IDF-weighted average Word2Vec vector of ``text``.

    Tokens that are missing from either the Word2Vec vocabulary or
    ``idf_dict`` are skipped. When no token qualifies, a zero vector of
    length ``model.vector_size`` is returned.
    """
    word_vecs = []
    weight_sum = 0
    for word in word_tokenize(text):
        if word in model.wv and word in idf_dict:
            weight = idf_dict[word]
            word_vecs.append(model.wv[word] * weight)
            weight_sum += weight
    if not word_vecs:
        return np.zeros(model.vector_size)
    return np.sum(word_vecs, axis=0) / weight_sum


def predict_input(text):
    """Classify ``text`` with the stacked ensemble.

    Six base-model scores are computed for the text and passed to the
    meta-model, whose prediction is returned.

    Args:
        text: The string to classify.

    Returns:
        The first element of ``meta_model.predict`` for this text. As described
        in this notebook, 0 means the text is not describing a natural disaster
        and 1 means it is.
    """
    res = _load_prediction_resources()
    device = res["device"]

    # TF-IDF features
    tfidf_feat = res["tfidf_vectorizer"].transform([text])

    # TF-IDF features concatenated with the weighted Word2Vec vector
    w2v_feat = _weighted_w2v(text, res["w2v_model"], res["idf_weights"]).reshape(1, -1)
    full_feat = np.hstack([tfidf_feat.toarray(), w2v_feat])

    # BERT: class logits plus a mean-pooled embedding of the last hidden layer
    inputs = res["tokenizer"](text, padding=True, truncation=True, return_tensors="pt", max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = res["bert_model"](**inputs, output_hidden_states=True)
        logits = outputs.logits
        last_hidden_state = outputs.hidden_states[-1]
        attention_mask = inputs['attention_mask']
        # Expand the mask to the hidden size so masked positions contribute nothing.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        # Clamp to avoid dividing by zero if the mask were all zeros.
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embedding = (sum_embeddings / sum_mask).cpu().numpy()
        bert_prob = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()

    # Meta-features, one column per base model (column order is significant):
    # 0 LR, 1 RF, 2 XGB, 3 SVM decision value, 4 LR on BERT embedding, 5 BERT.
    meta_input = np.zeros((1, 6))
    meta_input[:, 0] = res["lr_model"].predict_proba(tfidf_feat)[:, 1]
    meta_input[:, 1] = res["rf_model"].predict_proba(tfidf_feat)[:, 1]
    meta_input[:, 2] = res["xgb_model"].predict_proba(full_feat)[:, 1]
    meta_input[:, 3] = res["svm_model"].decision_function(tfidf_feat)
    meta_input[:, 4] = res["lrbert_model"].predict_proba(mean_embedding)[:, 1]
    meta_input[:, 5] = bert_prob

    # Predict with meta-model
    return res["meta_model"].predict(meta_input)[0]

## Make predictions on arbitrary text

The prediction is 0 when the text does not describe a natural disaster and 1 when it does.

In [3]:
# Example input that reports a wildfire; the recorded output for this cell is 1.
wildfire_text = "There is a wildfire raging in southern California."
prediction = predict_input(wildfire_text)
print("Predicted that text is describing a natural disaster:", prediction)

Predicted that text is describing a natural disaster: 1


In [4]:
# Example input with no disaster content; the recorded output for this cell is 0.
calm_text = "What a peaceful day!"
prediction = predict_input(calm_text)
print("Predicted that text is describing a natural disaster:", prediction)

Predicted that text is describing a natural disaster: 0
