In [87]:
import pandas as pd
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random

# Lift pandas' display limits: show every column and never truncate cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Download the NLTK 'punkt' package; as the last expression of the cell, the
# call's return value is what the notebook displays.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\whisn\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# 1. Read KBBI

In [88]:
# Load the KBBI word list: a headerless CSV whose column 0 holds the words.
# dtype=str + keep_default_na=False keep every entry as literal text. With the
# pandas defaults, tokens such as "nan", "null", "NA" or "None" are converted
# to a float NaN, which silently turns real dictionary entries into missing
# values.
file_path = './kbbi.csv'
df = pd.read_csv(file_path, header=None, dtype=str, keep_default_na=False)
df.head()

Unnamed: 0,0
0,a
1,ab
2,aba
3,aba-aba
4,abad


In [89]:
# Read the word list and extract column 0 as a plain Python list.
# dtype=str + keep_default_na=False stop pandas from parsing entries such as
# "nan", "null" or "NA" as missing values (float NaN), so those words stay in
# the vocabulary instead of being dropped by the later string-only filter.
df = pd.read_csv(file_path, header=None, dtype=str, keep_default_na=False)
words = df[0].tolist()
print(words[:5])

['a', 'ab', 'aba', 'aba-aba', 'abad']


In [90]:
# Keep only entries that are real strings (this discards e.g. float NaN values
# produced for missing cells) so every remaining item is usable as a token.
words = [str(entry) for entry in words if isinstance(entry, str)]

# 2. Tokenization

In [91]:
# Wrap each word in its own one-element list: the result is a list of
# single-token "sentences", previewed here with its first five items.
sentences = [[token] for token in words]
print(sentences[:5])

[['a'], ['ab'], ['aba'], ['aba-aba'], ['abad']]


In [92]:
# Number of single-word sentences, i.e. how many words survived the string filter.
print(len(sentences))

30341


# 3. Modelling

In [93]:
# Train a Word2Vec model (sg=1, 100-dimensional vectors, window of 5) and keep
# every word regardless of frequency (min_count=1).
# NOTE(review): each "sentence" built earlier holds exactly one token, so a word
# never has a neighbour inside the context window. Presumably no real
# co-occurrence signal is learned and the similarities printed later are close
# to arbitrary -- confirm, and consider training on a corpus of real sentences.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)
model

<gensim.models.word2vec.Word2Vec at 0x2070cf74d60>

# 4. Test

In [131]:
# Probe the trained model with a few everyday words and print the five nearest
# neighbours of each; words missing from the vocabulary are reported instead.
sample_words = ['bagaimana', 'makan', 'minum']
for word in sample_words:
    try:
        similar_words = model.wv.most_similar(word, topn=5)
    except KeyError:
        print(f"Kata '{word}' tidak ditemukan dalam model")
    else:
        print(f"Kata-kata yang mirip dengan '{word}': {similar_words}")

# Step 5: Spell checking via cosine similarity
def correct_spelling(word, model, top_n=5):
    """Suggest corrections for ``word`` using the model's nearest neighbours.

    Args:
        word: The word to look up.
        model: An object exposing ``model.wv.most_similar(word, topn=...)``.
        top_n: How many suggestions to request.

    Returns:
        Whatever ``most_similar`` returns for ``word``. If the lookup raises
        ``KeyError``, the placeholder ``[(None, 0)]`` is returned instead.
    """
    not_found = [(None, 0)]
    try:
        return model.wv.most_similar(word, topn=top_n)
    except KeyError:
        return not_found

# Example usage
misspelled_word = 'bagakmana'
corrections = correct_spelling(misspelled_word, model)

# The first suggestion is treated as the best one; a None word is the
# "nothing found" placeholder returned by correct_spelling.
if corrections[0][0] is None:
    best_correction = "Tidak ditemukan koreksi"
else:
    best_correction = corrections[0][0]

print(f"Koreksi untuk '{misspelled_word}': {best_correction}")

# Show the full list of top suggestions
print(f"Beberapa saran untuk '{misspelled_word}': {corrections}")

Kata-kata yang mirip dengan 'bagaimana': [('prematur', 0.3819778859615326), ('kenyut', 0.3796915113925934), ('hidrolika', 0.35573849081993103), ('ve', 0.3541423976421356), ('penis', 0.34888318181037903)]
Kata-kata yang mirip dengan 'makan': [('diamagnetisme', 0.3716622292995453), ('dongbret', 0.36857935786247253), ('realistis', 0.36793580651283264), ('ridan', 0.3474102318286896), ('meunasah', 0.34664663672447205)]
Kata-kata yang mirip dengan 'minum': [('hidrolisis', 0.41755184531211853), ('mazmumah', 0.3732891082763672), ('sarhad', 0.36761611700057983), ('dinul-islam', 0.35484209656715393), ('jasmaniah', 0.3514745235443115)]
Koreksi untuk 'bagakmana': Tidak ditemukan koreksi
Beberapa saran untuk 'bagakmana': [(None, 0)]


In [95]:
from difflib import get_close_matches

In [12]:
import pandas as pd
from difflib import get_close_matches

# Build the word list from a CSV file, filtered by minimum word length
def generate_word_spelling(file_path, del_thresh):
    """Load words from a headerless CSV and keep those that are long enough.

    Every entry is read as literal text: ``dtype=str`` together with
    ``keep_default_na=False`` prevents pandas from turning words such as
    "nan", "null" or "NA" into missing values, which would silently drop them
    from the vocabulary.

    Args:
        file_path: Path to a CSV file without a header row; the words are
            taken from the first column.
        del_thresh: Minimum number of characters a word must have to be kept.

    Returns:
        A list of strings, in file order, whose length is >= ``del_thresh``.
        Empty fields are never returned.
    """
    df = pd.read_csv(file_path, header=None, dtype=str, keep_default_na=False)
    candidates = df[0]
    lengths = candidates.str.len()
    # `lengths > 0` keeps empty fields out even when del_thresh <= 0, matching
    # the previous behaviour where missing cells were always excluded.
    keep = (lengths >= del_thresh) & (lengths > 0)
    return candidates[keep].tolist()

# Spelling-correction function
def correct_spelling(word, word_list):
    """Return the most plausible correction for ``word`` taken from ``word_list``.

    Rules are tried in priority order and the first hit wins:

    1. The closest entry with the same length as ``word`` (difflib similarity
       of at least 0.8).
    2. A single-letter substitution, scanning positions from the last
       character to the first and trying the letters a-z in order.
    3. A single letter (a-z) appended to the end of ``word``.

    Note: an earlier cell of this notebook defines a different
    ``correct_spelling(word, model, top_n=5)``; running this cell rebinds the
    name to this implementation.

    Args:
        word: The (possibly misspelled) word to correct.
        word_list: Known words. Entries that are not strings (for example NaN
            values coming from a CSV) are ignored.

    Returns:
        The corrected word, or ``None`` when none of the rules finds a match.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    known_words = [w for w in word_list if isinstance(w, str)]
    # Rules 2 and 3 test many candidates; a set makes each membership check
    # constant-time instead of a full scan of the list.
    vocabulary = set(known_words)

    # Priority 1: closest word with the same number of characters
    same_length_words = [w for w in known_words if len(w) == len(word)]
    close_matches = get_close_matches(word, same_length_words, n=1, cutoff=0.8)
    if close_matches:
        return close_matches[0]

    # Priority 2: replace one character, working from the end towards the start
    for i in range(len(word) - 1, -1, -1):
        for char in alphabet:
            if char == word[i]:
                continue  # substituting the same letter changes nothing
            possible_word = word[:i] + char + word[i + 1:]
            if possible_word in vocabulary:
                return possible_word

    # Priority 3: append a single character at the end
    for char in alphabet:
        possible_word = word + char
        if possible_word in vocabulary:
            return possible_word

    return None

# Example usage
file_path = './10k-indonesia-common-words.csv'  # Adjust to your file path
# file_path = './kbbi.csv'  # Adjust to your file path
del_thresh = 3  # Minimum allowed word length
list_word = generate_word_spelling(file_path, del_thresh)

# Try correcting a misspelled word
misspelled_word = 'bagaimans'
corrected_word = correct_spelling(misspelled_word, list_word)
print(f"Koreksi untuk '{misspelled_word}': {corrected_word}")


Koreksi untuk 'bagaimans': bagaimana


In [1]:
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# Fix the langdetect seed so repeated runs give the same detection results.
DetectorFactory.seed = 0

def is_indonesian(word):
    """Return True when langdetect classifies ``word`` as 'id'.

    Any input for which ``detect`` raises ``LangDetectException`` is treated
    as not Indonesian and yields False.
    """
    try:
        detected_language = detect(word)
    except LangDetectException:
        return False
    return detected_language == 'id'

def filter_indonesian_words(word_list):
    """Return, in order, the entries of ``word_list`` accepted by ``is_indonesian``."""
    return list(filter(is_indonesian, word_list))

# Read the word list from the input file (one entry per line)
input_file = './10k-indonesia-common-words.csv'
output_file = './cleaned-10k.csv'

# Decode explicitly instead of relying on the platform's locale encoding.
# 'utf-8-sig' reads plain UTF-8 and also strips a leading byte-order mark if
# one is present, so the first word is never polluted by it.
with open(input_file, 'r', encoding='utf-8-sig') as file:
    word_list = file.read().splitlines()

# Keep only the words detected as Indonesian
filtered_words = filter_indonesian_words(word_list)

# Build a single-column DataFrame ('Kata') from the filtered words
df = pd.DataFrame(filtered_words, columns=['Kata'])

# Save to CSV without the index column
df.to_csv(output_file, index=False)

print(f"Filtered words have been saved to {output_file}")


Filtered words have been saved to ./cleaned-10k.csv
