### Check 1
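- Check if question is longer than 19 characters (measured after cleaning)
- Check if answer is longer than 50 characters (measured after cleaning)
- Replace bad characters (newlines, double quotes, hyphens, underscores) in question and answer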

In [None]:
import json
from tqdm import tqdm

# Check 1: clean the raw scraped records and drop those that are too short.

# Read with an explicit encoding so the non-ASCII text decodes the same way on
# every platform (the cleaned data is written back out as UTF-8 as well).
with open("../scraper_viva/viva.json", encoding="utf-8") as f:
    data = json.load(f)

# Single source of truth for the characters that get cleaned, so the
# "bad_chars" counter and the actual replacement cannot drift apart:
# newline, hyphen and underscore become a space; double quotes are removed.
bad_char_replacements = {"\n": " ", '"': "", "-": " ", "_": " "}
bad_char_table = str.maketrans(bad_char_replacements)

# A record is kept only when both cleaned texts are strictly longer than these.
min_question_len = 19
min_answer_len = 50

viva_data = []  # cleaned records that passed the length check
len_answer_question = 0  # records rejected by the length check
bad_chars = 0  # records whose question or answer contained a bad character

with tqdm(total=len(data), desc="Checking ...") as pbar:
    for ans in data:
        raw_question = ans["question"]
        raw_answer = ans["answer"]

        # Count the record once, no matter how many bad characters it holds.
        if any(char in raw_question or char in raw_answer for char in bad_char_replacements):
            bad_chars += 1

        # One pass per string instead of four chained .replace() calls.
        question = raw_question.translate(bad_char_table)
        answer = raw_answer.translate(bad_char_table)

        # Lengths are measured on the cleaned text, not on the raw text.
        if len(question) > min_question_len and len(answer) > min_answer_len:
            viva_data.append(
                {
                    "url": ans["url"],
                    "question": question,
                    "answer": answer,
                    "time": ans["time"],
                    "title": ans["title"],
                }
            )
        else:
            len_answer_question += 1

        pbar.set_postfix({"len_answer_question": len_answer_question, "bad_chars": bad_chars})
        pbar.update(1)

In [5]:
# Persist the cleaned records from Check 1 as pretty-printed UTF-8 JSON;
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaped.
with open("viva-data.json", "w", encoding="utf-8") as out_file:
    json.dump(obj=viva_data, fp=out_file, indent=4, ensure_ascii=False)

### Check 2
- Check for grammar mistakes in question and answer; an entry is kept only if each of them has at most 20 reported issues

In [None]:
from tqdm import tqdm
import language_tool_python
import json

# Check 2: run LanguageTool over every question/answer and keep the entries
# whose question and answer each have at most `max_grammar_issues` matches.
max_grammar_issues = 20

new_data_viva = []  # entries that passed the grammar check
bad_sentence = 0  # number of rejected entries
bad_sentence_data = []  # rejected entries together with the reported matches

# Load the input before starting the checker, so a missing or unreadable file
# does not leave a checker instance open.
with open("viva-data.json", encoding="utf-8") as f:
    data = json.load(f)

tool = language_tool_python.LanguageTool("sl")
try:
    with tqdm(total=len(data), desc="Checking grammar") as pbar:
        for ans in data:
            question = ans["question"]
            answer = ans["answer"]
            check_sentence_question = tool.check(question)
            check_sentence_answer = tool.check(answer)

            if (
                len(check_sentence_question) <= max_grammar_issues
                and len(check_sentence_answer) <= max_grammar_issues
            ):
                new_data_viva.append(ans)
            else:
                # Keep the matches next to the text so rejected entries can be inspected.
                bad_sentence_data.append(
                    {
                        "url": ans["url"],
                        "question_err": check_sentence_question,
                        "answer_err": check_sentence_answer,
                        "question": question,
                        "answer": answer,
                    }
                )
                bad_sentence += 1

            pbar.set_postfix({"bad sentence": bad_sentence})
            pbar.update(1)
finally:
    # Always close the checker, even when a check raises midway through.
    tool.close()

In [None]:
# Rank the rejected entries by the total number of reported issues
# (question + answer), worst first.
bad_sentence_data.sort(
    key=lambda entry: len(entry["question_err"]) + len(entry["answer_err"]),
    reverse=True,
)
# Preview the worst entry; evaluates to None when nothing was rejected
# (a plain [0] would raise IndexError on an empty list).
bad_sentence_data[0] if bad_sentence_data else None

In [None]:
# Persist the entries that passed the grammar check as pretty-printed UTF-8
# JSON; ensure_ascii=False keeps non-ASCII characters readable.
with open("viva-data-grammar.json", "w", encoding="utf-8") as out_file:
    json.dump(obj=new_data_viva, fp=out_file, indent=4, ensure_ascii=False)

### Check 3
- Check if bad words are present in question and answer (the text is translated from Slovenian to English before the profanity check)

In [None]:
from translate import Translator
from profanity_check import predict
from tqdm import tqdm


def translate_to_slovenian_and_check_profanity(text, translator):
    """Translate *text* with *translator* and report whether it is profane.

    Despite the function name, the translation direction is whatever
    *translator* is configured for; the caller in this file builds it with
    ``from_lang="sl", to_lang="en"``, i.e. Slovenian to English.

    Args:
        text: Text to check.
        translator: Object exposing a ``translate(text)`` method.

    Returns:
        ``True`` when ``predict`` labels the translated text ``1``,
        otherwise ``False``.
    """
    prediction = predict([translator.translate(text)])[0]
    if prediction == 1:
        return True
    return False


if __name__ == "__main__":
    # Check 3: translate every question/answer from Slovenian to English and
    # keep only the entries where neither text is flagged as profane.
    new_data_not_profanity = []  # entries with no profanity detected
    prof_count = 0  # entries rejected because of profanity
    translator = Translator(from_lang="sl", to_lang="en")

    # Continue from the grammar-filtered output of Check 2. Reading
    # "viva-data.json" (the Check 1 output) here would silently drop the
    # grammar filtering from the final dataset. The encoding is explicit so
    # the non-ASCII text decodes the same way on every platform.
    with open("viva-data-grammar.json", encoding="utf-8") as f:
        data = json.load(f)

    with tqdm(total=len(data), desc="Checking profanity") as pbar:
        for ans in data:
            question = ans["question"]
            answer = ans["answer"]
            check_profanity_question = translate_to_slovenian_and_check_profanity(
                question, translator
            )
            check_profanity_answer = translate_to_slovenian_and_check_profanity(answer, translator)

            # Reject the entry as soon as either text is flagged.
            if check_profanity_question or check_profanity_answer:
                prof_count += 1
            else:
                new_data_not_profanity.append(ans)

            pbar.set_postfix({"profanity": prof_count})
            pbar.update(1)

In [None]:
import os

# Persist the entries that passed the profanity check as pretty-printed UTF-8 JSON.
# Create the target directory first: open() does not create missing parent
# directories, and failing here would come only after the long translation run.
os.makedirs("./final-data", exist_ok=True)

# NOTE(review): "iva-data-final.json" looks like a typo for
# "viva-data-final.json"; the name is left unchanged because other code may
# already read this exact path — confirm before renaming.
with open("./final-data/iva-data-final.json", "w", encoding="utf-8") as f:
    json.dump(new_data_not_profanity, f, ensure_ascii=False, indent=4)