In [1]:
# Fetch the core.zip bundle from Google Drive by its file id (saved into the current directory).
!gdown 1OBbo21v_-esL31rxtNtsMHrA8T1JYqAd

Downloading...
From: https://drive.google.com/uc?id=1OBbo21v_-esL31rxtNtsMHrA8T1JYqAd
To: /content/core.zip
100% 538M/538M [00:04<00:00, 108MB/s] 


In [None]:
# Extract the bundle; -o overwrites existing files without prompting so the
# cell also works non-interactively (Restart & Run All).
!unzip -o /content/core.zip

Archive:  /content/core.zip
replace th.sp.model? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: th.sp.model             
  inflating: th.sp.vocab             
  inflating: tmp_dataset.csv         
  inflating: decision_tree.sav       
  inflating: th.arpa.bin             


In [None]:
# %pip installs into the environment of the running kernel.
%pip install kenlm

In [None]:
# %pip installs into the environment of the running kernel.
%pip install sentencepiece

In [None]:
import kenlm
import math
import numpy as np
import pickle
import scipy
import sentencepiece  # type: ignore
from text_normalizer import normalize
from typing import List


class SentencesLM:
    """Perplexity scorer built on a KenLM model and a SentencePiece tokenizer.

    Each line is normalized with ``normalize(line, accent=False)``, split into
    SentencePiece pieces, and scored by the KenLM model. ``do`` folds the
    per-line scores into a single perplexity value for the whole document.
    """

    def __init__(
        self,
        lm_model_filename: str = "th.arpa.bin",
        sp_model_filename: str = "th.sp.model",
    ):
        """Load the language model and the tokenizer.

        Args:
            lm_model_filename: Path to the KenLM model file.
            sp_model_filename: Path to the SentencePiece model file.
        """
        lm_config = kenlm.Config()
        # NOTE(review): 2 presumably selects kenlm's POPULATE_OR_READ load
        # method -- confirm against the kenlm bindings in use.
        lm_config.load_method = 2

        self.lm = kenlm.Model(str(lm_model_filename), lm_config)
        self.sp = sentencepiece.SentencePieceProcessor()
        self.sp.load(str(sp_model_filename))

    def pp(self, log_score, length) -> float:
        """Compute perplexity score as ``10 ** (-log_score / length)``."""
        return 10.0 ** (-log_score / length)

    def do(self, document: List[str]) -> float:  # type: ignore
        """Compute one perplexity value over all lines of ``document``.

        Args:
            document: The lines of the document, one string per line.

        Returns:
            The perplexity of the summed line scores, rounded to one decimal.

        Raises:
            ZeroDivisionError: If ``document`` contains no lines.
        """
        total_log_score = 0
        total_length = 0
        for line in document:
            line = normalize(line, accent=False)
            tokenized_line = " ".join(self.sp.encode_as_pieces(line))
            total_log_score += self.lm.score(tokenized_line)
            # NOTE(review): length counts whitespace-separated chunks of the
            # normalized line (+1), not SentencePiece pieces. The downstream
            # classifier was presumably fit on this scale, so do not change it
            # without refitting -- confirm.
            total_length += len(line.split()) + 1
        return round(self.pp(total_log_score, total_length), 1)


classifier_filename = "decision_tree.sav"
# SECURITY: unpickling can execute arbitrary code; only load this file when it
# comes from a trusted source.
with open(classifier_filename, "rb") as classifier_file:
    # The context manager closes the file handle once the model is loaded.
    classifier = pickle.load(classifier_file)

# Shared scorer instance used by classify_spam.
lm = SentencesLM()


def classify_spam(text: str):
    """Label ``text`` as spam or not from its language-model perplexity.

    The text is split on newlines and scored by the module-level ``lm``; the
    natural log of that perplexity is the single feature given to the
    module-level ``classifier``.

    Returns:
        A ``(prediction, log_pp_score)`` tuple, where ``prediction`` is
        whatever ``classifier.predict`` returns for the 1x1 feature array.
    """
    lines = text.split("\n")
    log_pp_score = math.log(lm.do(lines))

    # One sample with one feature.
    features = np.array([[log_pp_score]])
    return classifier.predict(features), log_pp_score


def sample_score(log_scores, n, percentage=0.1) -> np.ndarray:
    """Draw scores from a truncated normal fitted to ``log_scores``.

    The distribution uses the mean and standard deviation of ``log_scores``
    and is truncated to ``[min(log_scores), max(log_scores)]``. Sampling uses
    a fixed seed, so repeated calls with the same inputs return the same
    values.

    Args:
        log_scores: Non-empty collection of scores to fit.
        n: Population size; ``int(percentage * n)`` values are drawn.
        percentage: Fraction of ``n`` to draw.

    Returns:
        A 1-D array of ``int(percentage * n)`` sampled scores.
    """
    # `import scipy` alone does not guarantee that the `stats` submodule is
    # loaded, so import it explicitly.
    from scipy import stats

    size = int(percentage * n)
    lower_bound, upper_bound = min(log_scores), max(log_scores)
    mean, std = np.mean(log_scores), np.std(log_scores)

    if std == 0:
        # All scores are identical: the distribution collapses to one point
        # and the standardized bounds below would be 0/0.
        return np.full(size, mean, dtype=float)

    # truncnorm takes its bounds in standard-deviation units around `loc`.
    # Seeding through random_state (rather than np.random.seed) leaves the
    # global NumPy RNG state untouched.
    return stats.truncnorm.rvs(
        (lower_bound - mean) / std,
        (upper_bound - mean) / std,
        loc=mean,
        scale=std,
        size=size,
        random_state=0,
    )


def sample_text_back(texts, log_scores, percentage=0.1, replace=True) -> List[str]:
    """Sample some spam text back in the dataset
    using log score distribution of language model.

    Target scores are drawn with ``sample_score``; for each target, the text
    whose log score is closest (squared difference, first match wins on ties)
    is selected.

    Args:
        texts: Texts to sample from.
        log_scores: Log score of each text, in the same order as ``texts``.
        percentage: Fraction of ``len(texts)`` to sample.
        replace: If True, the same text may be selected more than once. If
            False, each text is selected at most once, and sampling stops
            early when no unselected text is left.

    Returns:
        The selected texts, in sampling order.
    """
    # Materialize both inputs so lookups below are positional; indexing a
    # pandas Series with an int is label-based and breaks on a non-default index.
    text_list = list(texts)
    score_list = list(log_scores)

    sampled_scores = sample_score(score_list, len(text_list), percentage)

    sampled_texts = []
    selected_idx = set()

    for samp_score in sampled_scores:
        min_diff, min_idx = float("inf"), -1

        for idx, s in enumerate(score_list):
            if idx in selected_idx:
                continue

            diff = (samp_score - s) ** 2
            if diff < min_diff:
                min_diff, min_idx = diff, idx

        if min_idx < 0:
            # No candidate left (everything already selected); stop instead
            # of using the -1 sentinel as an index.
            break

        sampled_texts.append(text_list[min_idx])

        if not replace:
            selected_idx.add(min_idx)

    return sampled_texts


In [None]:
# Compute the perplexity score of a sample text
lm.do(["หนังxจีนมาใหม่ TM0165 แม่เลี้ยงสาวWang Xiaoni หลับอยู่ เจอลูกเลี้ยงชวนเพื่อนมารุมเย็ดแม่เลี้ยง จับมอมยาสลบก่อนลวนลามลงมืดข่มขืนxxx สวิง 3-1 เย็ดจนแตกในซะใจ"])

40.69115244919519

In [None]:
# Classify the same sample text with the perplexity-based spam classifier

classify_spam("หนังxจีนมาใหม่ TM0165 แม่เลี้ยงสาวWang Xiaoni หลับอยู่ เจอลูกเลี้ยงชวนเพื่อนมารุมเย็ดแม่เลี้ยง จับมอมยาสลบก่อนลวนลามลงมืดข่มขืนxxx สวิง 3-1 เย็ดจนแตกในซะใจ")



(array([1]), 40.69115244919519)

In [None]:
import pandas as pd

# Sample about 10% of the spam texts back in, since we want the model to
# still see some of them.
df = pd.read_csv("/content/tmp_dataset.csv")
texts, log_scores = df["text"], df["log_score"]

sampled_texts = sample_text_back(texts, log_scores)
sampled_texts[:5]