In [12]:
from dotenv import dotenv_values
import requests
import json
import random
import numpy as np
import pickle
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from data import DataModule
from models import Transformer
from transformers import RobertaTokenizerFast
import torch
from tqdm import tqdm
import numpy as np
# English stop-word list from NLTK; read by remove_stopwords() below.
en_stopwords = stopwords.words('english')

# Settings (server URLs and the team token) are read from the .env file one directory up.
config = dotenv_values("../.env")  # mapping of the variable names found in ../.env to their values
debug_server = config['DEBUG_SERVER']
token = config['TOKEN']
submitt_url = config['DEBUG_SERVER_SUBMIT']
# Header sent with every JSON POST made by the submission loop.
HEADER = {'content-type': 'application/json'}

# Models

## ML Models

In [2]:
def remove_whitespace(text):
    """Collapse each run of whitespace into a single space and trim both ends."""
    words = text.split()
    return " ".join(words)

def remove_URLs(text):
    """Delete links from ``text``.

    A link is anything starting with ``http://``, ``https://`` or ``www.`` and
    running up to the next whitespace character. Matching is case-sensitive,
    and the whitespace around a removed link is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_punctuation(text):
    """Strip punctuation from a sequence of tokens.

    The tokens are joined with single spaces and the joined string is
    re-tokenised on runs of word characters, so a token containing inner
    punctuation comes back as several shorter tokens.

    Returns the resulting list of tokens.
    """
    joined = " ".join(text)
    word_tokenizer = RegexpTokenizer(r'\w+')
    return word_tokenizer.tokenize(joined)

def remove_stopwords(text):
    """Return the tokens of ``text`` that are not in ``en_stopwords``, order preserved."""
    return [word for word in text if word not in en_stopwords]

def stemming(text):
    """Return the Porter stem of every token in ``text``, order preserved."""
    stem = PorterStemmer().stem
    return [stem(token) for token in text]

def ml_text_processing(text):
    """Normalise one raw document for the classical-ML pipelines.

    String stage: lower-case, collapse whitespace, remove URLs.
    Token stage: tokenise, drop punctuation, drop stop words, stem.

    Returns a one-element list holding the cleaned tokens joined by spaces,
    which is the form MLPipeline passes to its processor's ``transform``.
    """
    cleaned = remove_URLs(remove_whitespace(text.lower()))
    tokens = word_tokenize(cleaned)
    for step in (remove_punctuation, remove_stopwords, stemming):
        tokens = step(tokens)
    return [" ".join(tokens)]

In [3]:
class MLPipeline():
    """Classical-ML classifier: preprocessing function -> pickled processor -> pickled model.

    Parameters
    ----------
    path_model : path of a pickled object exposing ``predict``.
    path_text_processor : path of a pickled object exposing ``transform``.
    proc_func : callable applied to the raw text before ``transform``.
    """

    def __init__(self,path_model, path_text_processor, proc_func):
        self.text_preprocessing = proc_func
        # NOTE(security): unpickling runs arbitrary code; only load files from a trusted source.
        self.processor = self._load_pickle(path_text_processor)
        self.model = self._load_pickle(path_model)

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and close the file handle even if loading fails."""
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def process(self, text):
        """Return the processor's ``transform`` output for the preprocessed ``text``."""
        return self.processor.transform(self.text_preprocessing(text))

    def predict(self, text):
        """Return the model's prediction for ``text`` as a plain ``int``.

        Raises ValueError if the model returns anything other than exactly one value.
        """
        prediction = self.model.predict(self.process(text))
        # .item() is the explicit single-element extraction; int() on an array
        # with ndim > 0 is deprecated in recent NumPy releases.
        return int(np.asarray(prediction).item())

In [4]:
# Pickled text processors shared by the pipelines below.
# (File names suggest CountVectorizer / TF-IDF vectorisers; confirm against the training code.)
cv_path = r"../models/ml_models/CV.sav"
tfidf_path = r"../models/ml_models/TFIDF.sav"

# Each pipeline pairs one pickled model with the processor named in its file name.
model1 = MLPipeline(
    path_model=r"../models/ml_models/m1_LR_CV.sav",
    path_text_processor=cv_path,
    proc_func=ml_text_processing,
)
model2 = MLPipeline(
    path_model=r"../models/ml_models/m2_LR_TFIDF.sav",
    path_text_processor=tfidf_path,
    proc_func=ml_text_processing,
)
model3 = MLPipeline(
    path_model=r"../models/ml_models/m3_RFC_CV.sav",
    path_text_processor=cv_path,
    proc_func=ml_text_processing,
)


In [5]:
# Fourth classical pipeline; uses the same TF-IDF processor file as model2.
# The path is a raw string like its siblings: the original f-string had no placeholders.
model4 = MLPipeline(r"../models/ml_models/m4_RFC_TFIDF.sav", tfidf_path, ml_text_processing)

## DL Model

In [6]:
# Checkpoint file restored by DLModel.
model_path = r"../models/transformer_32/model-19-0.22.ckpt"
# Directory passed to RobertaTokenizerFast.from_pretrained by DLModel.
tokenizer_path = r"../models/roberta-tokenizer"

In [7]:
class DLModel():
    """Transformer classifier plus RoBERTa tokenizer behind a ``predict`` method.

    Parameters
    ----------
    model_path : checkpoint file handed to ``load_from_checkpoint``.
    tokenizer_path : directory handed to ``RobertaTokenizerFast.from_pretrained``.
    """

    def __init__(self, model_path, tokenizer_path):
        # NOTE(review): load_from_checkpoint is called on a freshly built instance.
        # If Transformer is a PyTorch Lightning module this is a classmethod, so the
        # hyper-parameters passed here may not be the ones the restored model uses,
        # and newer Lightning releases reject the instance-level call. Confirm
        # against models.Transformer before changing it.
        self.model = Transformer(
                        ntokens=30000,
                        emsize=128,
                        d_hid=128,
                        nlayers=1,
                        nhead=2,
                        dropout=0.2,).load_from_checkpoint(model_path)
        self.model.eval()
        self.tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path)

    def predict(self, text, threshold=0.5):
        """Classify ``text`` and return a NumPy array of 0.0 / 1.0 decisions.

        The text is tokenised (truncated / padded to 512 tokens), the model
        output is passed through a sigmoid, and a decision of 1 is produced
        where the resulting probability is strictly greater than ``threshold``.

        The returned array has the same shape as the model output
        (presumably one value per input text; verify against Transformer.forward).
        """
        encoded = self.tokenizer.encode_plus(
            text,
            max_length=512,
            padding="max_length",
            return_tensors="pt",
            return_attention_mask=False,
            truncation=True,
        )
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            probability = torch.sigmoid(self.model(encoded['input_ids']))
        probability = probability.detach().cpu().numpy()
        # A sigmoid output is positive, so applying heaviside to it directly
        # yields 1 for every input; shift by the threshold first.
        return np.heaviside(probability - threshold, 0)

In [8]:
# Build the deep-learning system from the checkpoint and tokenizer paths defined above.
model0 = DLModel(model_path=model_path, tokenizer_path=tokenizer_path)

file ../models/roberta-tokenizer/config.json not found
file ../models/roberta-tokenizer/config.json not found


# Official Submission

## Workflow
```
getWritings
while request not empty:
    for run in runs:
        POST request
    getWritings
```

In [14]:
# GET_URL = 'http://127.0.0.1:5000'
GET_URL = config['T1_SERVER'] + token
# A system's position in this list is the run id appended to the submission URL.
systems = [model0, model1, model2, model3, model4]
counter = 20 # THIS NEEDS TO BE UPDATED MANUALLY WHEN STOPPING THE LOOP

# Fetch the first batch of writings; the loop ends when the server returns an empty list.
ans = requests.get(GET_URL)
ans.raise_for_status()
ans_dict = ans.json()
while ans_dict != []:
    counter += 1
    # total= lets tqdm show real progress; enumerate() has no length of its own.
    for run, system in tqdm(enumerate(systems), total=len(systems), desc=f"Submission count {counter}"):
        post_url = config['T1_SUBMISSION'] + token + "/" + str(run)
        submission = [
            {
                'nick': writing['nick'],
                # Systems return either a plain int or a one-element array;
                # coerce to int so json.dumps can serialise the decision.
                'decision': int(np.ravel(system.predict(writing['content']))[0]),
                # Placeholder score: drawn uniformly at random, not produced by the system.
                'score': round(random.uniform(0,4),1),
            }
            for writing in ans_dict
        ]
        post_request = requests.post(post_url, data = json.dumps(submission), headers = HEADER)
        # Stop on a rejected submission instead of silently moving on to the next run.
        post_request.raise_for_status()
    ans = requests.get(GET_URL)
    ans.raise_for_status()
    ans_dict = ans.json()

Submission count 2: 0it [00:11, ?it/s]


KeyboardInterrupt: 