# Scientia-Model


## Import Libraries and Configuration


In [85]:
from marshal import dumps, loads
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from mpstemmer import MPStemmer
from numpy import argsort, inf, set_printoptions
from polars import col, read_csv, Series
from re import sub
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from types import FunctionType

file_path_dataset = "temp/repository_pnj_20212023.csv"
file_path_select_feature = "temp/01_select_feature.csv"
file_path_lowercase = "temp/02_lowercase.csv"
file_path_remove_empty_abstract = "temp/03_remove_empty_abstract.csv"
file_path_remove_dash_abstract = "temp/04_remove_dash_abstract.csv"
file_path_remove_same_title_abstract = "temp/05_remove_same_title_abstract.csv"
file_path_merge = "temp/06_merge.csv"
file_path_remove_non_alphabet = "temp/07_remove_non_alphabet.csv"
file_path_unique = "temp/08_unique.csv"
file_path_add_lang = "temp/09_add_lang.csv"
file_path_filter_lang = "temp/10_filter_lang.csv"
file_path_stem = "temp/11_stem.csv"
file_path_remove_stopwords = "temp/12_remove_stopwords.csv"
file_path_dataset_clean = "temp/repository_pnj_20212023clean.csv"
file_path_model = "temp/model.marshall"

set_printoptions(threshold=inf)


def print_file_table(output):
    print(f"./{output}", read_csv(output))

## Preprocessing


### 1. Select Features


In [16]:
read_csv(file_path_dataset).columns

['url',
 'title',
 'abstract',
 'document_type',
 'subject',
 'unit_field',
 'user_id',
 'date_deposited',
 'last_modified']

In [15]:
input = file_path_dataset
output = file_path_select_feature

read_csv(input).with_columns(
    [col("title").alias("f1"), col("abstract").alias("f2")]
).write_csv(output)

print_file_table(output)

./temp/01_select_feature.csv shape: (6_295, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ date_depo ┆ last_modi ┆ f1        ┆ f2       │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ sited     ┆ fied      ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Pengelola ┆           ┆ Thesis /  ┆ … ┆ 02 Sep    ┆ 02 Sep    ┆ Pengelola ┆          │
│ epository ┆ an        ┆           ┆ Skripsi / ┆   ┆ 2021      ┆ 2021      ┆ an        ┆          │
│ .pnj.ac.i ┆ Dokumen   ┆           ┆ Tugas     ┆   ┆ 09:05     ┆ 09:07     ┆ Dokumen   ┆          │
│ d/i…      ┆ Bongkar   ┆           ┆ Akhir

### 2. Lowercase


In [14]:
input = file_path_select_feature
output = file_path_lowercase

read_csv(input).with_columns(
    [
        col("f1").str.to_lowercase(),
        col("f2").str.to_lowercase(),
    ]
).write_csv(output)

print_file_table(output)

./temp/02_lowercase.csv shape: (6_295, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ date_depo ┆ last_modi ┆ f1        ┆ f2       │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ sited     ┆ fied      ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Pengelola ┆           ┆ Thesis /  ┆ … ┆ 02 Sep    ┆ 02 Sep    ┆ pengelola ┆          │
│ epository ┆ an        ┆           ┆ Skripsi / ┆   ┆ 2021      ┆ 2021      ┆ an        ┆          │
│ .pnj.ac.i ┆ Dokumen   ┆           ┆ Tugas     ┆   ┆ 09:05     ┆ 09:07     ┆ dokumen   ┆          │
│ d/i…      ┆ Bongkar   ┆           ┆ Akhir…    

### 3. Remove Missing Values


#### 3.1. Remove Empty Abstract


In [13]:
input = file_path_lowercase
output = file_path_remove_empty_abstract

read_csv(input).with_columns(col("f2").replace("", None)).drop_nulls().write_csv(output)

print_file_table(output)

./temp/03_remove_empty_abstract.csv shape: (5_047, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ date_depo ┆ last_modi ┆ f1        ┆ f2       │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ sited     ┆ fied      ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Similiter ┆ -         ┆ Dokumen   ┆ … ┆ 11 Mar    ┆ 11 Mar    ┆ similiter ┆ -        │
│ epository ┆ y_Dominan ┆           ┆ Internal  ┆   ┆ 2022      ┆ 2022      ┆ y_dominan ┆          │
│ .pnj.ac.i ┆ ts’       ┆           ┆ (Lainnya) ┆   ┆ 04:11     ┆ 04:11     ┆ ts’       ┆          │
│ d/i…      ┆ Factors … ┆           

#### 3.2. Remove Dash Abstract


In [12]:
input = file_path_remove_empty_abstract
output = file_path_remove_dash_abstract

read_csv(input).with_columns(col("f2").replace("-", None)).drop_nulls().write_csv(
    output
)

print_file_table(output)

./temp/04_remove_dash_abstract.csv shape: (5_042, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ date_depo ┆ last_modi ┆ f1        ┆ f2       │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ sited     ┆ fied      ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Proses    ┆ Dalam men ┆ Thesis /  ┆ … ┆ 03 Sep    ┆ 03 Sep    ┆ proses    ┆ dalam    │
│ epository ┆ Pembuatan ┆ jalankan  ┆ Skripsi / ┆   ┆ 2022      ┆ 2022      ┆ pembuatan ┆ menjalan │
│ .pnj.ac.i ┆ Surat     ┆ sebuah    ┆ Tugas     ┆   ┆ 16:23     ┆ 16:23     ┆ surat     ┆ kan      │
│ d/i…      ┆ Jalan (…  ┆ bisni…    ┆

#### 3.3. Remove Same Title Abstract


In [11]:
input = file_path_remove_dash_abstract
output = file_path_remove_same_title_abstract

read_csv(input).filter(col("f1") != col("f2")).write_csv(output)

print_file_table(output)

./temp/05_remove_same_title_abstract.csv shape: (5_034, 11)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ date_depo ┆ last_modi ┆ f1        ┆ f2       │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ sited     ┆ fied      ┆ ---       ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ str       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Proses    ┆ Dalam men ┆ Thesis /  ┆ … ┆ 03 Sep    ┆ 03 Sep    ┆ proses    ┆ dalam    │
│ epository ┆ Pembuatan ┆ jalankan  ┆ Skripsi / ┆   ┆ 2022      ┆ 2022      ┆ pembuatan ┆ menjalan │
│ .pnj.ac.i ┆ Surat     ┆ sebuah    ┆ Tugas     ┆   ┆ 16:23     ┆ 16:23     ┆ surat     ┆ kan      │
│ d/i…      ┆ Jalan (…  ┆ bisni

### 4. Merge Title Abstract


In [10]:
input = file_path_remove_same_title_abstract
output = file_path_merge

read_csv(input).with_columns((col("f1") + " " + col("f2")).alias("f")).drop(
    ["f1", "f2"]
).write_csv(output)

print_file_table(output)

./temp/06_merge.csv shape: (5_034, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Proses    ┆ Dalam men ┆ Thesis /  ┆ … ┆ Adela     ┆ 03 Sep    ┆ 03 Sep    ┆ proses   │
│ epository ┆ Pembuatan ┆ jalankan  ┆ Skripsi / ┆   ┆ Piqih     ┆ 2022      ┆ 2022      ┆ pembuata │
│ .pnj.ac.i ┆ Surat     ┆ sebuah    ┆ Tugas     ┆   ┆           ┆ 16:23     ┆ 16:23     ┆ n surat  │
│ d/i…      ┆ Jalan (…  ┆ bisni…    ┆ Akhir…    ┆   

### 5. Remove Non Alphabetical Characters


In [9]:
input = file_path_merge
output = file_path_remove_non_alphabet

read_csv(input).with_columns(col("f").str.replace_all(r"[^a-z]+", " ")).write_csv(
    output
)

print_file_table(output)

./temp/07_remove_non_alphabet.csv shape: (5_034, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ Proses    ┆ Dalam men ┆ Thesis /  ┆ … ┆ Adela     ┆ 03 Sep    ┆ 03 Sep    ┆ proses   │
│ epository ┆ Pembuatan ┆ jalankan  ┆ Skripsi / ┆   ┆ Piqih     ┆ 2022      ┆ 2022      ┆ pembuata │
│ .pnj.ac.i ┆ Surat     ┆ sebuah    ┆ Tugas     ┆   ┆           ┆ 16:23     ┆ 16:23     ┆ n surat  │
│ d/i…      ┆ Jalan (…  ┆ bisni…    ┆ 

### 6. Remove Duplicate


In [6]:
input = file_path_remove_non_alphabet
output = file_path_unique

read_csv(input).unique("f").write_csv(output)

print_file_table(output)

./temp/08_unique.csv shape: (4_832, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ ANALYSIS  ┆ This      ┆ Thesis /  ┆ … ┆ Arizal    ┆ 01 Jun    ┆ 01 Jun    ┆ analysis │
│ epository ┆ OF THE    ┆ study is  ┆ Skripsi / ┆   ┆ Dwi       ┆ 2023      ┆ 2023      ┆ of the   │
│ .pnj.ac.i ┆ POSTURE   ┆ about     ┆ Tugas     ┆   ┆ Kurniawan ┆ 01:14     ┆ 01:14     ┆ posture  │
│ d/i…      ┆ OF AN …   ┆ analyzing ┆ Akhir…    ┆  

### 7. Filter Based on Language


#### 7.1. Add Column Language


In [7]:
input = file_path_unique
output = file_path_add_lang


def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


read_csv(input).with_columns(
    col("f").map_elements(detect_language, return_dtype=str).alias("lang")
).write_csv(output)

print_file_table(output)

./temp/09_add_lang.csv shape: (4_832, 11)
┌────────────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬──────┐
│ url        ┆ title      ┆ abstract   ┆ document_t ┆ … ┆ date_depo ┆ last_modi ┆ f         ┆ lang │
│ ---        ┆ ---        ┆ ---        ┆ ype        ┆   ┆ sited     ┆ fied      ┆ ---       ┆ ---  │
│ str        ┆ str        ┆ str        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ str       ┆ str  │
│            ┆            ┆            ┆ str        ┆   ┆ str       ┆ str       ┆           ┆      │
╞════════════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪══════╡
│ https://re ┆ ANALYSIS   ┆ This study ┆ Thesis /   ┆ … ┆ 01 Jun    ┆ 01 Jun    ┆ analysis  ┆ en   │
│ pository.p ┆ OF THE     ┆ is about   ┆ Skripsi /  ┆   ┆ 2023      ┆ 2023      ┆ of the    ┆      │
│ nj.ac.id/i ┆ POSTURE OF ┆ analyzing  ┆ Tugas      ┆   ┆ 01:14     ┆ 01:14     ┆ posture   ┆      │
│ …          ┆ AN …       ┆ …          ┆ Akhir…  

#### 7.2. Remove Non-"Bahasa Indonesia"


In [8]:
input = file_path_add_lang
output = file_path_filter_lang

read_csv(input).filter(col("lang") == "id").drop("lang").write_csv(output)

print_file_table(output)

./temp/10_filter_lang.csv shape: (4_712, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ ANALISIS  ┆ Proyek    ┆ Thesis /  ┆ … ┆ Raditya   ┆ 04 Aug    ┆ 04 Aug    ┆ analisis │
│ epository ┆ KOMPARATI ┆ konstruks ┆ Skripsi / ┆   ┆ Ranugra   ┆ 2022      ┆ 2022      ┆ komparat │
│ .pnj.ac.i ┆ F WASTE   ┆ i banyak  ┆ Tugas     ┆   ┆ Mahesa    ┆ 04:15     ┆ 04:15     ┆ if waste │
│ d/i…      ┆ DAN …     ┆ mengh…    ┆ Akhir…  

### 8. Stemming


In [86]:
input = file_path_filter_lang
output = file_path_stem


def stem_text(text):
    return MPStemmer().stem_kalimat(text)


read_csv(input).with_columns(
    col("f").map_elements(stem_text, return_dtype=str)
).write_csv(output)

print_file_table(output)

./temp/11_stem.csv shape: (4_712, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ ANALISIS  ┆ Proyek    ┆ Thesis /  ┆ … ┆ Raditya   ┆ 04 Aug    ┆ 04 Aug    ┆ analisis │
│ epository ┆ KOMPARATI ┆ konstruks ┆ Skripsi / ┆   ┆ Ranugra   ┆ 2022      ┆ 2022      ┆ komparat │
│ .pnj.ac.i ┆ F WASTE   ┆ i banyak  ┆ Tugas     ┆   ┆ Mahesa    ┆ 04:15     ┆ 04:15     ┆ if waste │
│ d/i…      ┆ DAN …     ┆ mengh…    ┆ Akhir…    ┆   ┆

### 9. Remove Stopwords


In [87]:
input = file_path_stem
output = file_path_remove_stopwords


def remove_stopwords(text):
    return StopWordRemoverFactory().create_stop_word_remover().remove(text)


read_csv(input).with_columns(
    col("f").map_elements(remove_stopwords, return_dtype=str)
).write_csv(output)

print_file_table(output)

./temp/12_remove_stopwords.csv shape: (4_712, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ ANALISIS  ┆ Proyek    ┆ Thesis /  ┆ … ┆ Raditya   ┆ 04 Aug    ┆ 04 Aug    ┆ analisis │
│ epository ┆ KOMPARATI ┆ konstruks ┆ Skripsi / ┆   ┆ Ranugra   ┆ 2022      ┆ 2022      ┆ komparat │
│ .pnj.ac.i ┆ F WASTE   ┆ i banyak  ┆ Tugas     ┆   ┆ Mahesa    ┆ 04:15     ┆ 04:15     ┆ if waste │
│ d/i…      ┆ DAN …     ┆ mengh…    ┆ Akh

### 10. Sort


In [88]:
input = file_path_remove_stopwords
output = file_path_dataset_clean

read_csv(input).sort("url").write_csv(output)

print_file_table(output)

./temp/repository_pnj_20212023clean.csv shape: (4_712, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ f        │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ---      │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ APLIKASI  ┆ Penggunaa ┆ Thesis /  ┆ … ┆ Thania    ┆ 30 Aug    ┆ 30 Aug    ┆ aplikasi │
│ epository ┆ VARIABLE  ┆ n         ┆ Skripsi / ┆   ┆ Anggita   ┆ 2021      ┆ 2021      ┆ variable │
│ .pnj.ac.i ┆ SPEED     ┆ Variable  ┆ Tugas     ┆   ┆ Nada      ┆ 18:07     ┆ 18:07     ┆ speed    │
│ d/i…      ┆ DRIVE …   ┆ Speed 

## Modeling


In [89]:
# Read the dataset from a CSV file
df = read_csv(file_path_dataset_clean)

# Initialize a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit the TF-IDF model to the 'f' column of the dataset
tfidf_matrix = vectorizer.fit_transform(df["f"].to_list())

# print("Feature names: ", vectorizer.get_feature_names_out())
# print("TF-IDF Matrix: ", tfidf_matrix.toarray())

from types import FunctionType


def find_similar_documents(title: str, abstract: str, top_n: int):
    from mpstemmer import MPStemmer
    from numpy import argsort
    from polars import Series
    from re import sub
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    from sklearn.metrics.pairwise import cosine_similarity

    # Combine title and abstract into one string and convert to lowercase
    combined_text = (title + " " + abstract).lower()

    # Remove all non-alphabetic characters and replace them with spaces
    cleaned_text = sub(r"[^a-z]", " ", combined_text)

    # Split the cleaned text into words and rejoin to remove extra spaces
    trimmed_text = " ".join(cleaned_text.split())

    # Apply stemming to the cleaned text
    stemmed_text = MPStemmer().stem_kalimat(trimmed_text)

    # Remove stop words from the stemmed text
    processed_text = (
        StopWordRemoverFactory().create_stop_word_remover().remove(stemmed_text)
    )

    # Compute cosine similarity between the processed text and the TF-IDF matrix
    cosine_scores = cosine_similarity(
        vectorizer.transform([processed_text]), tfidf_matrix
    ).flatten()

    # Get the indices of the top N most similar documents
    top_n_indices = argsort(cosine_scores)[-top_n:][::-1]

    # Return the top N most similar documents along with their similarity scores
    return (
        df[top_n_indices]
        .with_columns(
            Series(name="similarity", values=cosine_scores[top_n_indices] * 100)
        )
        .drop("f")
    )

In [90]:
with open(file_path_model, "rb") as file:
    model = file.read()

print(
    FunctionType(loads(model), globals())(
        title="APLIKASI VARIABLE SPEED DRIVE ATV610U75N4 PADA KONTROL MOTOR AC 3 FASA BERBASIS PLC",
        abstract="Penggunaan Variable Speed Drive (VSD) pada motor induksi tiga fasa dapat mengurangi konsumsi energi yang dibutuhkan oleh peralatan secara signifikan. Pengaturan kecepatan motor induksi dapat dilakukan dengan cara mengatur tegangan sumber atau frekuensi sumber yang dimaksudkan untuk mendapatkan kecepatan putaran dan torsi motor yang diinginkan atau sesuai dengan kebutuhan. Dengan pengaplikasian Variable Speed Drive (VSD) kecepatan motor dapat dikontrol dan beroperasi dengan mode multi speed. Panel kontrol kecepatan motor ini digunakan untuk memantau dan mengatur kecepatan motor induksi tiga fasa dengan komponen utama yang terdiri dari inverter / VSD tipe ATV610U75N4, PLC, SCADA, dan motor induksi sebagai output. Inverter atau VSD (Variable Speed Drive) digunakan sebagai komponen pengatur kecepatan operasi motor induksi tiga fasa dengan mengatur frekuensi keluaran. PLC sebagai pengontrol urutan dan mengatur input output yang kemudian diproses untuk menghasilkan output yang diinginkan. Sedangkan SCADA digunakan sebagai pengendali jarak jauhnya. Pengaturan kecepatan motor dilakukan dengan mengatur besar frekuensi di inverter. Semakin besar nilai frekuensi maka putaran motor akan lebih cepat. Pada VSD tipe ATV610U75N4 terdapat jenis gangguan fasa loss (Output Phase Loss). Gangguan OPL dari inverter dapat membuat motor tidak dapat bekerja karena daya motor yang tidak memenuhi batas pengaturan (setting) pada inverter yaitu 1,5 kW sedangkan motor yang digunakan sebesar 0,25 kW. Kata Kunci : Frekuensi, Motor, PLC, SCADA, VSD",
        top_n=1000,
    )
)

shape: (1_000, 10)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ url       ┆ title     ┆ abstract  ┆ document_ ┆ … ┆ user_id   ┆ date_depo ┆ last_modi ┆ similari │
│ ---       ┆ ---       ┆ ---       ┆ type      ┆   ┆ ---       ┆ sited     ┆ fied      ┆ ty       │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ str       ┆ ---       ┆ ---       ┆ ---      │
│           ┆           ┆           ┆ str       ┆   ┆           ┆ str       ┆ str       ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ https://r ┆ APLIKASI  ┆ Penggunaa ┆ Thesis /  ┆ … ┆ Thania    ┆ 30 Aug    ┆ 30 Aug    ┆ 100.0    │
│ epository ┆ VARIABLE  ┆ n         ┆ Skripsi / ┆   ┆ Anggita   ┆ 2021      ┆ 2021      ┆          │
│ .pnj.ac.i ┆ SPEED     ┆ Variable  ┆ Tugas     ┆   ┆ Nada      ┆ 18:07     ┆ 18:07     ┆          │
│ d/i…      ┆ DRIVE …   ┆ Speed     ┆ Akhir…    ┆   ┆           ┆       

## Evaluation


In [91]:
def evaluate_model(df, model_func):
    total_documents = df.height
    total_accuracy = 0.0

    for i in range(total_documents):
        similar_documents = model_func(
            title=df["title"][i], abstract=df["abstract"][i], top_n=10
        )

        accuracy = similar_documents.filter(similar_documents["url"] == df["url"][i])[
            "similarity"
        ][0]

        print(f"{i + 1}/{total_documents}: {accuracy:.2f}%")

        total_accuracy += accuracy

    return total_accuracy / total_documents


with open(file_path_model, "rb") as file:
    model_bytes = file.read()

average_accuracy = evaluate_model(
    read_csv(file_path_dataset_clean), FunctionType(loads(model_bytes), globals())
)

print(f"Average Accuracy: {average_accuracy}%")

1/4712: 100.00%
2/4712: 100.00%
3/4712: 100.00%
4/4712: 100.00%
5/4712: 100.00%
6/4712: 100.00%
7/4712: 100.00%
8/4712: 100.00%
9/4712: 100.00%
10/4712: 100.00%
11/4712: 100.00%
12/4712: 100.00%
13/4712: 100.00%
14/4712: 100.00%
15/4712: 100.00%
16/4712: 100.00%
17/4712: 100.00%
18/4712: 100.00%
19/4712: 100.00%
20/4712: 100.00%
21/4712: 100.00%
22/4712: 100.00%
23/4712: 100.00%
24/4712: 100.00%
25/4712: 100.00%
26/4712: 100.00%
27/4712: 100.00%
28/4712: 100.00%
29/4712: 100.00%
30/4712: 100.00%
31/4712: 100.00%
32/4712: 100.00%
33/4712: 100.00%
34/4712: 100.00%
35/4712: 100.00%
36/4712: 100.00%
37/4712: 100.00%
38/4712: 100.00%
39/4712: 100.00%
40/4712: 100.00%
41/4712: 100.00%
42/4712: 100.00%
43/4712: 100.00%
44/4712: 100.00%
45/4712: 100.00%
46/4712: 100.00%
47/4712: 100.00%
48/4712: 100.00%
49/4712: 100.00%
50/4712: 100.00%
51/4712: 100.00%
52/4712: 100.00%
53/4712: 100.00%
54/4712: 100.00%
55/4712: 100.00%
56/4712: 100.00%
57/4712: 100.00%
58/4712: 100.00%
59/4712: 100.00%
60/471