In [1]:
# %pip installs into the environment of the running kernel.
%pip install pdf2image



In [None]:
# poppler provides the PDF rendering binaries that pdf2image calls.
# -y answers conda's confirmation prompt, which a notebook cell cannot answer interactively.
%conda install -y -c conda-forge poppler

In [1]:
# %pip installs into the environment of the running kernel.
%pip install openpyxl



In [1]:
import numpy as np
import json
import glob

# Extract image
import pytesseract
import cv2
from PIL import Image

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel, TfidfModel

# spacy
import spacy
from nltk.corpus import stopwords

# lda visual
import pyLDAvis
import pyLDAvis.gensim

# PDF 
from pdf2image import convert_from_path

# Clean Data
from bs4 import BeautifulSoup # 4.6.3
import re

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Step 1, OCR
## Extract Text From PDF File

In [2]:
# Path to the PDF file
pdf_file = "assets/jurnal.pdf"
# Module-level accumulator: extract_text_from_image() appends every OCR line to this list.
result = []

def extract_text_and_show_contours(pdf_path):
    """Convert a PDF into page images and run contour detection + OCR on each page.

    A failure while converting the PDF is printed and not re-raised; in that
    case there are no pages to process and the function simply returns.

    Args:
        pdf_path: Path of the PDF file, passed to ``convert_pdf_to_images``.
    """
    try:
        page_paths = convert_pdf_to_images(pdf_path)
    except Exception as e:
        print(f"Error: {e}")
        page_paths = []

    # Handle the pages in order; the index is forwarded so output files can be
    # named per page.
    for i, page_path in enumerate(page_paths):
        show_text_contours(page_path, i)
        print(f"Hasil ekstraksi teks dari halaman {i + 1}")


def convert_pdf_to_images(pdf_path, output_dir="convert_to_img"):
    """Render every page of a PDF to a PNG file and return the file paths.

    Args:
        pdf_path: Path of the PDF file, passed to ``convert_from_path``.
        output_dir: Directory that receives the ``page_<n>.png`` files. It is
            created when missing. Defaults to ``"convert_to_img"``, the
            location that used to be hard-coded.

    Returns:
        list[str]: Paths of the written PNG files in page order; page numbers
        in the file names start at 1.
    """
    import os  # local import so the function does not depend on the imports cell

    # Saving into a missing directory raises, and the caller only prints that
    # error, so make sure the target exists up front.
    os.makedirs(output_dir, exist_ok=True)

    images = []
    for page_number, page in enumerate(convert_from_path(pdf_path), start=1):
        page_file = f"{output_dir}/page_{page_number}.png"
        page.save(page_file, 'PNG')
        images.append(page_file)

    return images

def extract_text_from_image(image):
    """Run pytesseract OCR on an image and collect the recognised lines.

    Every line is also appended to the module-level ``result`` list.

    Args:
        image: Image handed unchanged to ``pytesseract.image_to_string``.

    Returns:
        list[str]: The OCR output split on newline characters.
    """
    lines = pytesseract.image_to_string(image).split("\n")
    result.extend(lines)
    return lines

def show_text_contours(image, index_img):
    """Find large text regions on a page image, OCR them and save debug images.

    Each accepted region is written to ``temp/`` and passed to
    ``extract_text_from_image``. A copy of the page with the accepted regions
    outlined is written to ``boxes/``.

    Args:
        image: Path of the page image file (read with ``cv2.imread``).
        index_img: Page index used in the output file names.

    Raises:
        FileNotFoundError: If the image file cannot be read.
    """
    import os  # local import so the function does not depend on the imports cell

    page = cv2.imread(image)
    if page is None:
        # cv2.imread reports an unreadable file by returning None, not by raising.
        raise FileNotFoundError(f"Could not read image: {image}")

    # cv2.imwrite only reports failure through its return value, so a missing
    # output directory would otherwise drop the files silently.
    os.makedirs("temp", exist_ok=True)
    os.makedirs("boxes", exist_ok=True)

    gray = cv2.cvtColor(page, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (7, 7), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # Narrow, tall kernel (3 wide, 50 high): dilation joins vertically stacked
    # text lines into larger blocks.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 50))
    dilate = cv2.dilate(thresh, kernel, iterations=1)

    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]
    # Process regions from the top of the page downwards.
    cnts = sorted(cnts, key=lambda c: cv2.boundingRect(c)[1])

    # Draw the boxes on a separate copy: ``roi`` below is a view into ``page``,
    # so drawing on ``page`` itself would put the green frame into the OCR input.
    annotated = page.copy()
    region_count = 0

    for c in cnts:
        x, y, w, h = cv2.boundingRect(c)

        # Keep only large regions (the body content of the page); without this
        # filter every single word would be treated as a region.
        if h > 200 and w > 250:
            roi = page[y:y+h, x:x+w]

            # The first region of a page has no numeric suffix; later ones do.
            suffix = f"_{region_count}" if region_count > 0 else ""
            cv2.imwrite(f"temp/jurnal_countor_{index_img}{suffix}.png", roi)
            cv2.rectangle(annotated, (x, y), (x+w, y+h), (36, 255, 12), 2)

            extract_text_from_image(roi)
            region_count += 1

    cv2.imwrite(f"boxes/sample_boxes_{index_img}.png", annotated)


In [None]:
# Run the OCR pipeline on the PDF; recognised lines accumulate in the global `result` list.
# Re-running this cell without first resetting `result = []` appends the same lines again.
extract_text_and_show_contours(pdf_file)

In [None]:
# Display the OCR lines collected so far.
result

# Step 2, Clean Data

In [None]:
def clean_data(data):
    """Strip markup, @mentions, URLs and non-letter noise from a piece of text.

    Args:
        data: Raw text, e.g. a single OCR line.

    Returns:
        str: The text reduced to ASCII letters, single spaces and the
        punctuation ``. ! ? '``, with leading/trailing spaces removed.
    """
    # Drop any HTML/XML markup and keep only the text content.
    data = BeautifulSoup(data, "lxml").get_text()

    # Delete @mentions
    data = re.sub(r"@[A-Za-z0-9]+", ' ', data)

    # Delete URL links. Match up to the next whitespace so that characters such
    # as '-', '_', '?', '=' or '%' do not cut the match short and leave the
    # tail of the URL behind as stray words.
    data = re.sub(r"https?://\S+", ' ', data)

    # Keep only letters and the punctuation . ! ? ' (everything else becomes a space)
    data = re.sub(r"[^a-zA-Z.!?']", ' ', data)

    # Collapse runs of spaces into one
    data = re.sub(r" +", ' ', data)

    # Remove spaces at the beginning and end of the text
    return data.strip()

In [None]:
# Clean every OCR line and keep only the ones that still carry content afterwards.
data_clean = []


for item in result:
    item = item.strip()
    data = clean_data(item)
        
    # Discard fragments of 4 characters or fewer after cleaning.
    if len(data) > 4:
        data_clean.append(data)

print(data_clean)

# Step 3, Preprocessing

In [None]:
# allowed_postags=["NOUN", "ADJ", "VERB", "ADV", "PROPN", "WORK_OF_ART", "ORG", "GPE"]
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV", "PROPN"]):
    """Reduce each text to the lemmas of its tokens with an allowed POS tag.

    The spaCy model ``en_core_web_sm`` is loaded on every call with the
    ``parser`` and ``ner`` components disabled.

    Args:
        texts: Iterable of strings to process.
        allowed_postags: Values of ``token.pos_`` whose tokens are kept.

    Returns:
        list[str]: One space-joined string of lemmas per input text, in input
        order.
    """
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    return [
        " ".join(
            token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
        )
        for text in texts
    ]


# Lemmatize the cleaned lines; the bare name on the last line displays the result.
lemmatized_texts = lemmatization(data_clean)
lemmatized_texts

In [None]:
def gen_words(texts):
    """Tokenize each text with ``gensim.utils.simple_preprocess`` (``deacc=True``).

    Args:
        texts: Iterable of strings.

    Returns:
        list[list[str]]: One token list per input text, in input order.
    """
    return [gensim.utils.simple_preprocess(text, deacc=True) for text in texts]

# Tokenize the lemmatized lines; the bare name on the last line displays the result.
data_words = gen_words(lemmatized_texts)
data_words

In [None]:
#BIGRAMS AND TRIGRAMS
# Learn the phrase models: bigrams from the raw token lists, trigrams from the
# bigram-transformed token lists.
bigram_phrases = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram_phrases = gensim.models.Phrases(bigram_phrases[data_words], threshold=100)

# Wrap the trained models in Phraser objects; the helper functions in this cell
# index them with a token list to transform a document.
bigram = gensim.models.phrases.Phraser(bigram_phrases)
trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_bigrams(texts):
    """Apply the module-level ``bigram`` phraser to every token list in ``texts``.

    Returns:
        list: One transformed document per input document, in input order.
    """
    phrased = []
    for tokens in texts:
        phrased.append(bigram[tokens])
    return phrased

def make_trigrams(texts):
    """Apply the ``bigram`` phraser and then the ``trigram`` phraser to each token list.

    Returns:
        list: One transformed document per input document, in input order.
    """
    phrased = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        phrased.append(trigram[with_bigrams])
    return phrased

data_bigrams = make_bigrams(data_words)
# make_trigrams() applies the bigram phraser itself before the trigram phraser,
# so it is fed the raw token lists; passing data_bigrams would run the bigram
# pass a second time.
data_bigrams_trigrams = make_trigrams(data_words)

data_bigrams_trigrams

# Step 3, Build Model

In [None]:
#TF-IDF REMOVAL
# Build the dictionary and bag-of-words corpus, then remove from every document
# the terms whose TF-IDF score is below `low_value` and the terms the TF-IDF
# model leaves out entirely.

id2word = corpora.Dictionary(data_bigrams_trigrams)

texts = data_bigrams_trigrams

corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03
words = []  # log of every term dropped, across all documents
words_missing_in_tfidf = []

for i in range(0, len(corpus)):
    bow = corpus[i]
    doc_tfidf = tfidf[bow]  # score the document once and reuse the result
    tfidf_ids = [term_id for term_id, value in doc_tfidf]
    bow_ids = [term_id for term_id, value in bow]
    low_value_words = [term_id for term_id, value in doc_tfidf if value < low_value]

    # Terms with a TF-IDF score of 0 are missing from the model output.
    # This must be computed before `drops`, otherwise `words` would log the
    # previous document's missing terms instead of this document's.
    scored_ids = set(tfidf_ids)
    words_missing_in_tfidf = [term_id for term_id in bow_ids if term_id not in scored_ids]

    drops = low_value_words + words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])

    dropped_ids = set(drops)
    new_bow = [b for b in bow if b[0] not in dropped_ids]
    corpus[i] = new_bow

In [None]:
# Train the LDA model on every document except the last one (corpus[:-1]);
# the last document is scored separately in the cells that follow.
# random_state is fixed so repeated runs use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus[:-1],
                                           id2word=id2word,
                                           num_topics=10,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
# Take the last document, which was left out of training (the model was fit on corpus[:-1]).
test_doc = corpus[-1]

# Indexing the model with a bag-of-words document returns its topic scores.
# NOTE(review): presumably a list of (topic_id, score) pairs — confirm against the gensim version in use.
vector = lda_model[test_doc]

def Sort(sub_li):
    """Sort a list of sequences in place by their second item, largest first.

    Items with equal second values end up in the reverse of their original
    relative order.

    Args:
        sub_li: List of indexable items (e.g. ``(topic_id, score)`` pairs).

    Returns:
        The same list object, now sorted.
    """
    # Reversing first and then doing a stable descending sort yields the same
    # ordering (including ties) as an ascending sort followed by a reversal.
    sub_li.reverse()
    sub_li.sort(key=lambda pair: pair[1], reverse=True)
    return sub_li

# Order the scores of the held-out document from highest to lowest.
# Sort() works in place, so `vector` itself is reordered as well.
new_vector = Sort(vector)

print (new_vector)

In [None]:
# Persist the trained model to disk (assumes the models/ directory already exists — TODO confirm).
lda_model.save("models/test_model.model")

In [None]:
# Load the saved model back under a separate name; the next cell scores the held-out document with it.
new_model = gensim.models.ldamodel.LdaModel.load("models/test_model.model")

In [None]:
# Score the held-out document again, this time with the model reloaded from disk.
test_doc = corpus[-1]

vector = new_model[test_doc]

# Sort() is already defined in an earlier cell of this notebook, so it is
# reused here instead of being defined a second time.
new_vector = Sort(vector)
print (new_vector)

# Step 4, Visualize the Data

In [None]:
# Build the interactive topic visualisation and display it inline.
# The full corpus is passed here, while the model itself was trained on corpus[:-1].
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

In [None]:
# Collect the data behind the visualisation (topic coordinates, topic info,
# token table and topic order) in a plain dictionary.

data_to_save = {
    "topic_coordinates": vis.topic_coordinates.to_dict(orient="split"),
    "topic_info": vis.topic_info.to_dict(orient="split"),
    "token_table": vis.token_table.to_dict(orient="split"),
    "topic_order": vis.topic_order
}

# Save the data to a JSON file (assumes the outputs/ directory already exists — TODO confirm).

with open("outputs/visualisasi_lda.json", "w") as json_file:
    json.dump(data_to_save, json_file)