In [None]:
import sys
# True when the `google.colab` module is already loaded; later cells use this
# flag to decide whether to run the Colab-only setup steps.
IS_GOOGLE_COLAB = 'google.colab' in sys.modules

In [None]:
# On Colab, mount Google Drive and change into the project folder stored there.
if IS_GOOGLE_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')
  
  # NOTE(review): relative path — presumably assumes the working directory is /content; confirm.
  %cd ./drive/MyDrive/behind_the_words

In [None]:
import json
import os
from getpass import getpass

from utils.flask_ngrok import run_with_ngrok, NgrokAPI

# The ngrok API key is a secret and must not live in the notebook: read it from
# the NGROK_API_KEY environment variable, or ask for it interactively.
# NOTE: the key that used to be hard-coded here has been exposed and should be revoked.
NGROK_API_KEY = os.environ.get("NGROK_API_KEY") or getpass("ngrok API key: ")
ngrok_api = NgrokAPI(NGROK_API_KEY)

**If the next cell fails with "Found existing URLs ...", a backend tunnel is already running: use one of the listed URLs instead of starting a new server.**

In [None]:
# Collect the public URLs of tunnels whose session metadata marks them as
# belonging to this backend ({"for": "behind-the-words-backend"}).
tunnels = []

for tunnel in ngrok_api.get_all_tunnels():
    raw_metadata = tunnel["tunnel_session"].get("metadata")
    if not raw_metadata:
        # No metadata (or an empty string): not one of our tunnels.
        continue

    try:
        metadata = json.loads(raw_metadata)
    except (TypeError, ValueError):
        # Metadata is free-form; sessions started by other tools may not use JSON.
        continue

    if isinstance(metadata, dict) and metadata.get("for") == "behind-the-words-backend":
        tunnels.append(tunnel["public_url"])

# Stop the notebook here when a backend is already exposed, so "Run All" does
# not start a second server.
if len(tunnels) > 0:
    raise Exception(f"Found existing URLs {tunnels}")

In [None]:
# On Colab, install the packages listed in requirements.txt and download the
# TextBlob corpora.
if IS_GOOGLE_COLAB:
  %pip install -q -r requirements.txt
  !python -m textblob.download_corpora

In [None]:
import math
import os
import uuid

import docx
import language_tool_python
import numpy as np
import spacy
import tensorflow as tf
import xgboost
import requests
from flask import Flask, jsonify, request
from flask_cors import CORS
from PyPDF2 import PdfReader
from spellchecker import SpellChecker
from importlib import reload
from gensim import models
from utils.dir import get_latest_file_in_dir

In [None]:
# Project root; the other paths in this notebook are built relative to it.
BEHIND_THE_WORDS_DIR = "./"
DATA_DIR = os.path.join(BEHIND_THE_WORDS_DIR, "data")
# Passed to MetaphorUsage when it is constructed.
USING_GPU = False
# Default for make_dataset(): when True the features are returned as-is instead
# of being wrapped in an xgboost.DMatrix.
IS_SCIKIT_LEARN_API = False
# Request text is truncated to this many characters before prediction.
CHARACTER_LIMIT = 2056

In [None]:
# Create the shared NLP helpers once: the spaCy pipeline, the spell checker and
# the LanguageTool instance are later handed to RelevantFeatures.
print("Loading deps")
_nlp = spacy.load("en_core_web_sm")
_spellchecker = SpellChecker()
_language_tool = language_tool_python.LanguageTool("en-US")
print("Loaded")

In [None]:
# Put the project's metaphor/ directory on the import path.
# NOTE(review): presumably needed so modules inside metaphor/ can be imported by bare name — confirm.
sys.path.append(os.path.join(BEHIND_THE_WORDS_DIR, "metaphor/"))

In [None]:
from utils.load_word2vec import load_word2vec

# Path of the gzipped word2vec vectors (Google News, 300-d judging by the file name).
w2v_model_path = os.path.join(BEHIND_THE_WORDS_DIR, "data/gensim/word2vec-google-news-300.gz")
# NOTE(review): the second argument looks like the address of a local word2vec
# service — confirm against utils.load_word2vec.
word2vec = load_word2vec(w2v_model_path, "http://127.0.0.1:7070")

In [None]:
import metaphor.metaphor_usage as metaphor_usage
import relevant_features as rf

# Reload the module so edits to it are picked up without restarting the kernel.
reload(rf)
relevant_features = rf.RelevantFeatures(_nlp, _spellchecker, _language_tool)

reload(metaphor_usage)
MetaphorUsage = metaphor_usage.MetaphorUsage
# NOTE: from this line on `metaphor_usage` is the MetaphorUsage instance, no
# longer the module imported at the top of the cell.
# NOTE(review): `relevant_features.get_nlp` is passed without being called —
# confirm that this is what MetaphorUsage expects.
metaphor_usage = MetaphorUsage(relevant_features.get_nlp, USING_GPU)
metaphor_usage.load_model(
    os.path.join(BEHIND_THE_WORDS_DIR, "metaphor/models/metaphor_usage_model.pt")
)
metaphor_usage.load_word2vec(word2vec)

# Hand the configured metaphor model to the feature extractor.
relevant_features.set_metaphor_usage_instance(metaphor_usage)

In [None]:
# Load the most recent file found in ./models/rf/ into an XGBoost Booster.
# NOTE(review): which file counts as "latest" is decided by
# get_latest_file_in_dir — confirm its criterion.
RF_MODEL_PATH = get_latest_file_in_dir("./models/rf/")
print(f"Importing latest RF Model {RF_MODEL_PATH}")
model_rf = xgboost.Booster()
model_rf.load_model(RF_MODEL_PATH)

In [None]:
# Load the most recent file found in ./models/cnn/ as a Keras model.
# NOTE(review): which file counts as "latest" is decided by
# get_latest_file_in_dir — confirm its criterion.
CNN_MODEL_PATH = get_latest_file_in_dir("./models/cnn/")
print(f"Importing latest CNN Model {CNN_MODEL_PATH}")
model_cnn = tf.keras.saving.load_model(CNN_MODEL_PATH, compile=True, safe_mode=True)

In [None]:
# Load the most recent file found in ./models/rf-cnn/ into an XGBoost Booster.
# This model is fed the RF features plus the CNN prediction (see predict_rf_cnn).
# NOTE(review): which file counts as "latest" is decided by
# get_latest_file_in_dir — confirm its criterion.
RF_CNN_MODEL_PATH = get_latest_file_in_dir("./models/rf-cnn/")
print(f"Importing latest RF-CNN Model {RF_CNN_MODEL_PATH}")
model_rf_cnn = xgboost.Booster()
model_rf_cnn.load_model(RF_CNN_MODEL_PATH)

In [None]:
from getpass import getpass

app = Flask(__name__)
CORS(app)

# The ngrok authtoken is a secret and must not live in the notebook: read it
# from the NGROK_AUTHTOKEN environment variable, or ask for it interactively.
# NOTE: the token that used to be hard-coded here has been exposed and should be revoked.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")

# The metadata tag is what the "existing tunnels" check earlier in the notebook
# looks for to recognise tunnels opened by this backend.
run_with_ngrok(app, {
    "authtoken": NGROK_AUTHTOKEN,
    "metadata": json.dumps({ "for": "behind-the-words-backend" }),
    "version": "2",
})

In [None]:
def make_dataset(X, y=None, is_scikit_learn_api=IS_SCIKIT_LEARN_API):
    """Prepare features for a model's ``predict`` call.

    Args:
        X: Feature rows.
        y: Optional labels, only used when a DMatrix is built.
        is_scikit_learn_api: When true, ``X`` is handed back untouched.

    Returns:
        ``X`` itself, or an ``xgboost.DMatrix`` built from ``X`` and ``y``.
    """
    return X if is_scikit_learn_api else xgboost.DMatrix(X, label=y)


def predict_rf(txt):
    """Return the RF model's prediction for a single text.

    The text is turned into one feature row by ``relevant_features.get`` and
    scored as a batch of size one; the first prediction is returned.
    """
    features = relevant_features.get(txt)
    batch = make_dataset([features])
    return model_rf.predict(batch)[0]


def process_text(text):
    """Turn ``text`` into a fixed-length list of word vectors.

    The text is tokenized with spaCy, each token is lower-cased and looked up
    through ``word2vec.get_vec``. The result is cut to 384 rows and, when
    shorter, filled up with rows of 300 zeros.
    """
    max_rows = 384
    row_width = 300

    lowered_tokens = [tok.lower_ for tok in _nlp(text)]
    vectors = word2vec.get_vec(lowered_tokens).tolist()[:max_rows]

    missing_rows = max_rows - len(vectors)
    return vectors + [[0] * row_width] * missing_rows


def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of ``x``.

    Uses a numerically stable formulation: for negative ``x`` the naive form
    evaluates ``math.exp(-x)``, which raises ``OverflowError`` once ``-x``
    exceeds roughly 709. Here the exponent passed to ``math.exp`` is never
    positive, so very negative inputs return 0.0 instead of raising.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))

    # x < 0: exp(x) is in (0, 1) and cannot overflow.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)


def cnn_prediction_func(prediction):
    """Return the CNN prediction unchanged (currently an identity function).

    predict_rf_cnn passes the CNN output through this function before
    appending it to the feature vector.
    """
    return prediction


def predict_cnn(model_cnn, text):
    """Run the CNN on one text and return its first output value.

    Args:
        model_cnn: Model object exposing ``predict``.
        text: Raw input text; it is vectorized with ``process_text``.

    Returns:
        Element ``[0][0]`` of the model's prediction, converted via ``tolist``.
    """
    batch = np.array([process_text(text)])
    # Add a trailing axis of size 1 after the existing dimensions.
    batch = np.expand_dims(batch, axis=-1)

    predictions = model_cnn.predict(batch, verbose=None).tolist()
    return predictions[0][0]


def predict_rf_cnn(txt):
    """Score a text with the combined RF-CNN model.

    The feature row from ``relevant_features.get`` is extended with the CNN
    prediction (passed through ``cnn_prediction_func``) and scored as a batch
    of size one.

    Returns:
        A tuple ``(rf_cnn_prediction, cnn_prediction)``.
    """
    features = relevant_features.get(txt)
    cnn_prediction = predict_cnn(model_cnn, txt)

    combined = np.array([*features, cnn_prediction_func(cnn_prediction)])
    scores = model_rf_cnn.predict(make_dataset([combined]))

    return scores[0], cnn_prediction

In [None]:
# Upload extensions accepted by the /read-file endpoint (compared lower-cased).
ALLOWED_EXTENSIONS = {"txt", "pdf", "docx", "doc"}


def allowed_file(filename):
    """Tell whether ``filename`` ends in one of ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared
    case-insensitively; names without any dot are rejected.
    """
    _stem, dot, extension = filename.rpartition(".")
    if not dot:
        return False
    return extension.lower() in ALLOWED_EXTENSIONS


def file_extension(filename):
    """Return the lower-cased extension of ``filename`` without the dot.

    The extension is the text after the last dot. A name that contains no dot
    yields ``""`` instead of raising ``IndexError``.
    """
    _stem, dot, extension = filename.rpartition(".")
    return extension.lower() if dot else ""


def _extract_upload_text(file, file_ext):
    """Return the text content of an uploaded file, chosen by its extension."""
    if file_ext == "txt":
        # Replace undecodable bytes instead of failing on non-UTF-8 uploads.
        return file.read().decode("utf-8", errors="replace")
    if file_ext == "pdf":
        # Guard against pages for which no text could be extracted.
        return "".join(page.extract_text() or "" for page in PdfReader(file).pages)
    if file_ext in ("doc", "docx"):
        return "\n".join(p.text for p in docx.Document(file).paragraphs)
    return ""


@app.route("/read-file", methods=["POST"])
def read_file():
    """Extract the text of an uploaded txt / pdf / doc(x) file.

    Expects a multipart upload under the form field ``file``.

    Returns:
        ``{"text": <content>}`` on success, and ``{"text": ""}`` when no file
        was sent or its extension is not allowed. When the file cannot be
        parsed, responds with HTTP 422 and ``{"text": "", "error": ...}``
        instead of an unhandled server error.
    """
    file = request.files.get("file")
    if file is None or not file.filename or not allowed_file(file.filename):
        return jsonify({"text": ""})

    try:
        text = _extract_upload_text(file, file_extension(file.filename))
    except Exception:
        # Corrupt or mislabeled uploads (for example a legacy binary .doc,
        # which is not a .docx package) end up here; log the details and
        # answer with a JSON error the client can handle.
        app.logger.exception("Could not extract text from upload %r", file.filename)
        return jsonify({"text": "", "error": "Could not read the uploaded file."}), 422

    return jsonify({"text": text})

In [None]:
@app.route("/rf", methods=["POST"])
def predict_rf_handler():
    """Score the posted text with the RF model.

    Expects a JSON body of the form ``{"text": "<string>"}``. The text is
    truncated to CHARACTER_LIMIT characters before it is scored.

    Returns:
        JSON with ``probability`` (the model output) and ``results`` (the raw
        features of the text). A body without a string ``text`` field yields
        HTTP 400 with an ``error`` message instead of an unhandled exception.
    """
    content = request.json

    text = content.get("text") if isinstance(content, dict) else None
    if not isinstance(text, str):
        return jsonify({"error": 'Request body must be a JSON object with a string "text" field.'}), 400

    text = text[:CHARACTER_LIMIT]

    probability = predict_rf(text)

    return jsonify(
        {
            "probability": probability.item(),
            "results": relevant_features._get_features_raw(text),
        }
    )


@app.route("/rf-cnn", methods=["POST"])
def predict_rf_cnn_handler():
    """Score the posted text with the combined RF-CNN model.

    Expects a JSON body of the form ``{"text": "<string>"}``. The text is
    truncated to CHARACTER_LIMIT characters before it is scored.

    Returns:
        JSON with ``probability`` (the RF-CNN output) and ``results``: the raw
        features of the text followed by an entry holding the CNN prediction.
        A body without a string ``text`` field yields HTTP 400 with an
        ``error`` message instead of an unhandled exception.
    """
    content = request.json

    text = content.get("text") if isinstance(content, dict) else None
    if not isinstance(text, str):
        return jsonify({"error": 'Request body must be a JSON object with a string "text" field.'}), 400

    text = text[:CHARACTER_LIMIT]

    probability, cnn_prediction = predict_rf_cnn(text)

    return jsonify(
        {
            "probability": probability.item(),
            "results": [
                *relevant_features._get_features_raw(text),
                {
                    "id": "cnn_prediction",
                    "result": {
                        "cnn_prediction": cnn_prediction,
                    }
                },
            ],
        }
    )

In [None]:
def get_ngrok_sessions():
    """Return the ``tunnel_sessions`` list reported by the ngrok API.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example an invalid key), instead of failing later with a KeyError.
        requests.Timeout: If the API does not answer within 30 seconds.
    """
    response = requests.get(
        "https://api.ngrok.com/tunnel_sessions",
        headers={
            "Authorization": f"Bearer {NGROK_API_KEY}",
            "Ngrok-Version": "2"
        },
        # Without a timeout a stalled connection would block the notebook forever.
        timeout=30,
    )
    response.raise_for_status()
    return response.json()["tunnel_sessions"]

def kill_ngrok_sessions():
    """Ask the ngrok API to stop every tunnel session of the account.

    Stopping is best-effort: a session that cannot be stopped is reported with
    a printed warning and the remaining sessions are still processed.
    """
    headers = {
        "Authorization": f"Bearer {NGROK_API_KEY}",
        "Content-Type": "application/json",
        "Ngrok-Version": "2"
    }

    for session in get_ngrok_sessions():
        response = requests.post(
            f"https://api.ngrok.com/tunnel_sessions/{session['id']}/stop",
            json={},
            headers=headers,
            # Without a timeout a stalled connection would block the notebook forever.
            timeout=30,
        )
        if not response.ok:
            print(f"Could not stop ngrok session {session['id']}: HTTP {response.status_code}")

In [None]:
# Stop every ngrok session of the account before starting the server.
# NOTE(review): this is not limited to sessions tagged for this backend —
# confirm that stopping unrelated sessions is acceptable.
kill_ngrok_sessions()

# Start the Flask app on port 6060; this call blocks while the server runs.
app.run(port=6060)