# Final Project ASD

In [1]:
import pandas as pd
import numpy as np

### Text Extraction



### Vectorizer


In [2]:
# Vectorizer Alfabetis

alphabetic_list = {
    "a": 0,
    "b": 1,
    "c": 2,
    "d": 3,
    "e": 4,
    "f": 5,
    "g": 6,
    "h": 7,
    "i": 8,
    "j": 9,
    "k": 10,
    "l": 11,
    "m": 12,
    "n": 13,
    "o": 14,
    "p": 15,
    "q": 16,
    "r": 17,
    "s": 18,
    "t": 19,
    "u": 20,
    "v": 21,
    "w": 22,
    "x": 23,
    "y": 24,
    "z": 25,
}

def vectorizer_alfabetis(text_1: str, text_2: str, padding = None, pad_num=99, pad_dim=3) -> list:
    """
    Membuat vektor dari text berdasarkan urutan alfabetis

    Args:
        padding: list yang akan diisi di akhir agar kedua vektor panjangnya sama
        pad_num: elemen padding, semakin besar maka error karena ordinalitas makin kecil
        pad_dim: dimensi padding, semakin besar maka error karena ordinalitas makin kecil

    Returns: 
        list: list hasil vektorisasi

    Raises:
        TypeError: jika input argumen text_1 dan text_2 bukan string
        TypeError: jika input argumen pad_num bukan int
    """

    if type(text_1) != str or type(text_2) != str:
        raise TypeError("text_1 dan text_2 harus string")
    if type(pad_num) != int:
        raise TypeError("pad_num harus integer")

    result_1 = [alphabetic_list[code]+1 for code in text_1]
    result_2 = [alphabetic_list[code]+1 for code in text_2]
    
    if padding:
        if len(result_1) < len(result_2):
            result_1 = result_1 + ([pad_num] * (len(result_2) - len(result_1)))
            result_1 = result_1 + [pad_num] * pad_dim
            result_2 = result_2 + [pad_num] * pad_dim
        elif len(result_2) < len(result_1):
            result_2 = result_2 + ([pad_num] * (len(result_1) - len(result_2)))
            result_1 = result_1 + [pad_num] * pad_dim
            result_2 = result_2 + [pad_num] * pad_dim
        else:
            pass
    
    return result_1, result_2


# Testing
assert vectorizer_alfabetis("ubed", "debuaa", padding=True) == ([21, 2, 5, 4, 99, 99, 99, 99, 99], [4, 5, 2, 21, 1, 1, 99, 99, 99])


In [3]:
# One Hot Vectorizer

# def one_hot_vec(text_1: str, text_2: str, padding = None) -> list:

### Similiarity dan Distance
#### Cosine Similiarity
$$\text{Cosine Similiarity = }\dfrac{A \cdot B}{||A|| \ ||B||}$$

In [4]:
def cosine_similarity(vector_1: list, vector_2: list) -> float:
    """
    Menghitung cosine similarity antara 2 vektor

    Args:
        vector_1: list vektor 1
        vector_2: list vektor 2
    
    Returns:
        float: hasil perhitungan cosine similarity
    
    Raises:
        TypeError: jika input argumen vector_1 dan vector_2 bukan list
        ValueError: jika input argumen vector_1 dan vector_2 memiliki panjang yang berbeda
    """
    if type(vector_1) != list or type(vector_2) != list:
        raise TypeError("vector_1 dan vector_2 harus list")
    if len(vector_1) != len(vector_2):
        raise ValueError("vector_1 dan vector_2 harus panjang sama")
    
    return np.dot(vector_1, vector_2) / (np.linalg.norm(vector_1) * np.linalg.norm(vector_2))

# Testing
assert cosine_similarity([1, 2, 3], [1, 2, 3]) == 1

#### Euclidean Distance

$$\text{Euclidean Distance} = \sqrt{\sum_{i=1}^{n} (y_{i} - x_i)^2}$$

In [5]:
def euclidean_distance(vector_1: list, vector_2: list) -> float:
    """
    Menghitung euclidean distance antara 2 vektor

    Args:
        vector_1: list vektor 1
        vector_2: list vektor 2
    
    Returns:
        float: hasil perhitungan euclidean distance
    """
    return np.linalg.norm(np.array(vector_1) - np.array(vector_2))

assert euclidean_distance([3,4,5], [3,4,5]) == 0

#### Masalah dengan alphabetic vectorizer

In [6]:
text1 = "ubeda"
text2 = "ubedz"
text3 = "lolo"  # Gara-gara padding dengan angka besar, jadinya keitung similiar
vector_1, vector_2 = vectorizer_alfabetis(text1, text2, padding=True)
vector_3, vector_4 = vectorizer_alfabetis(text2, text3, padding=True)
print(f"Cosine similarity antara {text1} dan {text2} adalah {cosine_similarity(vector_1, vector_2)}")
print(f"Euclidean distance antara {text1} dan {text2} adalah {euclidean_distance(vector_1, vector_2)}")
print("\n")
print(f"Cosine similarity antara {text2} dan {text3} adalah {cosine_similarity(vector_3, vector_4)}")
print(f"Euclidean distance antara {text2} dan {text3} adalah {euclidean_distance(vector_3, vector_4)}")

Cosine similarity antara ubeda dan ubedz adalah 0.680616410626047
Euclidean distance antara ubeda dan ubedz adalah 25.0


Cosine similarity antara ubedz dan lolo adalah 0.9266939507419156
Euclidean distance antara ubedz dan lolo adalah 75.82216034906945


### Solusi
#### Levenshtein Ratio

In [2]:
def levenshtein_ratio(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_ratio(s2, s1)
        
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return ((max(len(s1), len(s2)) - previous_row[-1]) / max(len(s1), len(s2))) * 100


In [8]:
text1 = "ubeda"
text2 = "ubedz"
text3 = "lolo"


print(
    f"Levenshtein ratio antara {text1} dan {text2} adalah {levenshtein_ratio(text1, text2)} %")
print(
    f"Levenshtein ratio antara {text2} dan {text3} adalah {levenshtein_ratio(text2, text3)} %")


Levenshtein ratio antara ubeda dan ubedz adalah 80.0 %
Levenshtein ratio antara ubedz dan lolo adalah 0.0 %


### Import Kamus

In [3]:
kbbi = open('kbbi.txt', 'rb').read().lower().decode('utf-8').split("\r\n")
kbbi_map = {
    **{word: "dummy_val" for word in kbbi},
}

## Prototype

Optimasi pencarian similiaritas:
1. Menggunakan word_ngram, mencari di kamus yang mengandung kata hasil word_ngram (trigram) ditambah penghilangan kata duplikasi
2. Menggunakan TOLERATED_LEN, mencari kata yang panjangnya dalam range konstanta

Hasil optimasi berhasil mengeliminasi : \
Metode 1     : 72422 kata -> 13613 kata \
Metode 1 + 2 : 72422 kata -> 11607 kata

In [4]:
def word_ngram(text: str):
    unigram = [t for t in text]
    bigram = [list(unigram[i:i+2]) for i in range(len(unigram)-1)]
    trigram = [list(unigram[i:i+3]) for i in range(len(unigram)-2)]

    bigram_join = [''.join(b) for b in bigram]
    trigram_join = [''.join(t) for t in trigram]

    ngram = trigram_join
    return list(set(ngram))

word_ngram("ubed")

['ube', 'bed']

In [6]:
corpus = "saya mau tamasya berkeliling keliling kota hendsk melihat lihat pemsndangan yng ada zzzzzzzzzz".split(" ")

typo_suspect = []
for word in corpus:
    if word not in kbbi_map:
        typo_suspect.append(word)
        
search_key = [word_ngram(gram) for gram in typo_suspect]
search_key

[['end', 'dsk', 'hen', 'nds'],
 ['snd', 'pem', 'ems', 'nga', 'gan', 'dan', 'nda', 'ang', 'msn'],
 ['yng'],
 ['zzz']]

In [7]:
possible_key = []
TOLERATED_LEN = 2
idx = 0
for i in search_key:
    for kb in kbbi_map.keys():
        for j in i:
            if j in kb and (len(kb) <= TOLERATED_LEN + len(typo_suspect[idx])) and (len(kb) >= TOLERATED_LEN - len(typo_suspect[idx])):
                possible_key.append(kb)
    idx += 1
                
possible_key, len(possible_key)

(['adenda',
  'adendum',
  'agenda',
  'apendiks',
  'bendrat',
  'bendo',
  'bendoro',
  'bende',
  'benda',
  'bendok',
  'bendir',
  'berbendi',
  'bendang',
  'blenda',
  'bersendu',
  'bendol',
  'behena',
  'bersendi',
  'bendar',
  'blender',
  'berendam',
  'bendanya',
  'bahenol',
  'benduan',
  'bendalu',
  'bendela',
  'bendi',
  'berbenda',
  'bendala',
  'bertenda',
  'bendara',
  'bendesa',
  'berhenti',
  'bendul',
  'brendi',
  'berenda',
  'bendari',
  'bendawat',
  'bendu',
  'bendawi',
  'bendera',
  'bersenda',
  'bendung',
  'bendel',
  'cendala',
  'cendol',
  'cendera',
  'cenderai',
  'cendana',
  'cendawan',
  'cendang',
  'cheng',
  'cendayam',
  'cecendet',
  'cenduai',
  'cendok',
  'cendekia',
  'defender',
  'dendang',
  'dendaan',
  'dependen',
  'dendi',
  'didenda',
  'dipendam',
  'direndam',
  'dendeng',
  'dendam',
  'denda',
  'endilau',
  'endapan',
  'endam',
  'eigendom',
  'endogami',
  'endevia',
  'endus',
  'endusan',
  'endofit',
  'endemis'

In [8]:
suggestion = {}
for typo in typo_suspect:
    similarity_word = []
    similarity_ratio = []
    for key in possible_key:
        similarity_word.append(key)
        similarity_ratio.append(levenshtein_ratio(typo, key))
    sim_df = pd.DataFrame({"word": similarity_word, "similarity": similarity_ratio})
    sim_df = sim_df.sort_values(by="similarity", ascending=False)

    # if similiarity less than 0.5
    if all(sim_df["similarity"] < 50):
        suggestion[typo] = ["Kata tidak ditemukan di kamus"]
    else:
        suggestion[typo] = [sim_df.iloc[i]["word"] for i in range(3)]
    
    

In [9]:
suggestion

{'hendsk': ['hendak', 'hendak', 'hendak'],
 'pemsndangan': ['pemandangan', 'pemandangan', 'pemandangan'],
 'yng': ['yang', 'nyang', 'eyang'],
 'zzzzzzzzzz': ['Kata tidak ditemukan di kamus']}

## Main Program

In [11]:
corpus_raw = open("Soal ASD FP.txt", "rb").read().lower().decode('utf-8')

In [12]:
import re
def text_cleaning(text: str):
    text = text.replace("\n", " ")
    text = text.replace("\r", " ")
    text = text.replace("\t", " ")
    text = text.replace("  ", " ")
    text = text.lower()
    text = re.sub(r"\[[^\]]*\]", "", text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r"\([^\)]*,[^\)]*\)", "", text)
    
    # remove extra white space
    text = re.sub(r'\s+', ' ', text)
    return text

In [None]:
from tqdm import tqdm

corpus = text_cleaning(corpus_raw).split(" ")
corpus_unique = list(set(corpus))

print("Mencari TYPO SUSPECT...")
typo_suspect = []
for word in tqdm(corpus_unique):
    if word not in kbbi_map:
        typo_suspect.append(word)

print("===TYPO SUSPECT sudah ditemukan===")
search_key = [word_ngram(gram) for gram in typo_suspect]

print("Mencari POSSIBLE KEY...")
possible_key = []
TOLERATED_LEN = 2
idx = 0
for i in tqdm(search_key):
    for kb in kbbi_map.keys():
        for j in i:
            if j in kb and (len(kb) <= TOLERATED_LEN + len(typo_suspect[idx])) and (len(kb) >= TOLERATED_LEN - len(typo_suspect[idx])):
                possible_key.append(kb)
    idx += 1
print("===POSSIBLE KEY sudah ditemukan===")

print("Mencari SUGGESTION...")
suggestion = {}
THRESHOLD = 60
for typo in tqdm(typo_suspect):
    similarity_word = []
    similarity_ratio = []
    for key in possible_key:
        similarity_word.append(key)
        similarity_ratio.append(levenshtein_ratio(typo, key))
    sim_df = pd.DataFrame(
        {"word": similarity_word, "similarity": similarity_ratio})
    sim_df = sim_df.sort_values(by="similarity", ascending=False)

    if all(sim_df["similarity"] < THRESHOLD):
        suggestion[typo] = ["Kata tidak ditemukan di kamus"]
    else:
        suggestion[typo] = [sim_df.iloc[i]["word"] for i in range(3)]

print("===SUGGESTION sudah ditemukan===")

In [69]:
suggestion

{'adaptive': ['adaptif', 'adaptif', 'adaptif'],
 'respon': ['respons', 'respons', 'respons'],
 'cc': ['Kata tidak ditemukan di kamus'],
 'shift': ['shinto', 'shinto', 'shinto'],
 'variance': ['variansi', 'varian', 'varians'],
 'integrated': ['integrator', 'integratif', 'integrator'],
 'bmkg': ['Kata tidak ditemukan di kamus'],
 'iglesias': ['diglosia', 'lasias', 'diglosia'],
 'dianingtyas': ['dibantingnya', 'dibantingnya', 'dibantingnya'],
 'keakurasian': ['keakuratan', 'keakuratan', 'keakuratan'],
 'penghujan': ['penghujat', 'menghujan', 'penghujat'],
 'rbf': ['Kata tidak ditemukan di kamus'],
 'winkler': ['winglet', 'winglet', 'winglet'],
 'bitwise': ['Kata tidak ditemukan di kamus'],
 'operasinya': ['operasional', 'operasional', 'operasional'],
 'lawrence': ['lawrensium', 'lawrensium', 'lawrensium'],
 'function': ['union', 'fungsio', 'union'],
 'mathurakani': ['memburakan', 'majakani', 'matahani'],
 'giovanis': ['galvanis', 'galvanis', 'galvanis'],
 'autoregressive': ['Kata tidak di

## Prototype Dengan Interface

In [27]:
def pipeline_dict(text):
    corpus = text_cleaning(text).split(" ")

    corpus_unique = list(set(corpus))
    typo_suspect = []
    for word in corpus_unique:
        if word not in kbbi_map:
            typo_suspect.append(word)

    search_key = [word_ngram(gram) for gram in typo_suspect]

    possible_key = []


    TOLERATED_LEN = 2
    idx = 0
    for i in search_key:
        for kb in kbbi_map.keys():
            for j in i:
                if j in kb and (len(kb) <= TOLERATED_LEN + len(typo_suspect[idx])) and (len(kb) >= TOLERATED_LEN - len(typo_suspect[idx])):
                    possible_key.append(kb)
        idx += 1

    suggestion = {}
    THRESHOLD = 60
    for typo in typo_suspect:
        similarity_word = []
        similarity_ratio = []
        for key in possible_key:
            similarity_word.append(key)
            similarity_ratio.append(levenshtein_ratio(typo, key))
        sim_df = pd.DataFrame(
            {"word": similarity_word, "similarity": similarity_ratio})
        sim_df = sim_df.sort_values(by="similarity", ascending=False)

        if all(sim_df["similarity"] < THRESHOLD):
            suggestion[typo] = ["Kata tidak ditemukan di kamus"]
        else:
            suggestion[typo] = [
                (sim_df.iloc[i]["word"], sim_df.iloc[i]["similarity"]) for i in range(3)]
    return suggestion

In [10]:
def pipeline(text):
    corpus = text_cleaning(text).split(" ")

    corpus_unique = list(set(corpus))
    typo_suspect = []
    for word in corpus_unique:
        if word not in kbbi_map:
            typo_suspect.append(word)

    search_key = [word_ngram(gram) for gram in typo_suspect]

    possible_key = []


    TOLERATED_LEN = 2
    idx = 0
    for i in search_key:
        for kb in kbbi_map.keys():
            for j in i:
                if j in kb and (len(kb) <= TOLERATED_LEN + len(typo_suspect[idx])) and (len(kb) >= TOLERATED_LEN - len(typo_suspect[idx])):
                    possible_key.append(kb)
        idx += 1

    suggestion = {}
    THRESHOLD = 60
    if len(typo_suspect) == 0:
        return "Tidak ada typo"
        
    for typo in typo_suspect:
        similarity_word = []
        similarity_ratio = []
        for key in possible_key:
            similarity_word.append(key)
            similarity_ratio.append(levenshtein_ratio(typo, key))
        sim_df = pd.DataFrame(
            {"word": similarity_word, "similarity": similarity_ratio})
        sim_df = sim_df.sort_values(by="similarity", ascending=False)

        if all(sim_df["similarity"] < THRESHOLD):
            suggestion[typo] = ["Kata tidak ditemukan di kamus"]
        else:
            suggestion[typo] = [
                (sim_df.iloc[i]["word"], sim_df.iloc[i]["similarity"]) for i in range(3)]
        hasil = ""


        for typo, s in suggestion.items():
            if suggestion[typo] == ["Kata tidak ditemukan di kamus"]:
                hasil += f"{typo} tidak ditemukan di kamus\n"
            else:
                hasil += f"'{typo}' dapat ditulis sebagai:\n"
                idx = 1
                for i in suggestion[typo]:
                    hasil += f"{idx}. {i[0]} ({round(i[1])} %)\n"
                    idx += 1
                hasil += "\n"

    return hasil

In [15]:
suggestion = pipeline("says kots")

In [None]:
import gradio as gr
DESCR = "Prototype typo detector bahasa indonesia untuk projek final pada mata kuliah ASD"
ARTCL = """"""
# Gradio custom css
custom_css = """
@font-face {
  font-family: 'poppins';
  font-style: normal;
  font-weight: 400, 700;
  src: url(https://fonts.gstatic.com/s/poppins/v20/pxiEyp8kv8JHgFVrJJbecmNE.woff2) format('woff2');
  unicode-range: U+0900-097F, U+1CD0-1CF6, U+1CF8-1CF9, U+200C-200D, U+20A8, U+20B9, U+25CC, U+A830-A839, U+A8E0-A8FB;
}
/* latin-ext */
@font-face {
  font-family: 'poppins';
  font-style: normal;
  font-weight: 400, 700;
  src: url(https://fonts.gstatic.com/s/poppins/v20/pxiEyp8kv8JHgFVrJJnecmNE.woff2) format('woff2');
  unicode-range: U+0100-024F, U+0259, U+1E00-1EFF, U+2020, U+20A0-20AB, U+20AD-20CF, U+2113, U+2C60-2C7F, U+A720-A7FF;
}
/* latin */
@font-face {
  font-family: 'poppins';
  font-style: normal;
  font-weight: 400, 900;
  src: url(https://fonts.gstatic.com/s/poppins/v20/pxiEyp8kv8JHgFVrJJfecg.woff2) format('woff2');
  unicode-range: U+0000-00FF, U+0131, U+0152-0153, U+02BB-02BC, U+02C6, U+02DA, U+02DC, U+2000-206F, U+2074, U+20AC, U+2122, U+2191, U+2193, U+2212, U+2215, U+FEFF, U+FFFD;
}
.title {
    font-family: poppins !important;
    font-weight: bold !important;
}
.description {
    font-family: poppins !important;
    font-weight: 400 !important;
    font-size: 14px !important;
}
.article {
    font-family: poppins !important;
    font-weight: 400 !important;
    
}
.panel_button {
    font-weight: 400 !important;
}
.panel_header {
    font-weight: 400 !important;
}
*{
    font-family: poppins !important;
}
"""

demo = gr.Interface(fn=pipeline, inputs="text", outputs="text", examples=["Says mau tamasya. Berkeliling keliling kots", "Ada anak bertsnya pada bapaknys"], examples_per_page=10, live=False, layout="unaligned",
                    theme="dark-peach",css=custom_css, title="TYPO DETECTOR KELOMPOK 5", description=DESCR, article=ARTCL, thumbnail=None, allow_flagging="never")

demo.launch(share=True)