In [1]:
import json
import math
from numpy.linalg import norm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string
from string import digits
from sklearn.metrics import mean_absolute_error as mae
from sklearn.pipeline import Pipeline
from spell import spell_check
from Lemma import lemmatization
from textblob import TextBlob

## Data

In [1]:
kj = input(str)

In [2]:
kj = "ITERA di bentuk berdasarkan SK kepres yg terletak di Lampung Selatan"
jawaban = "ITERA berlokasi di Lampung Selatan berdasarkan SK Menteri"

In [3]:
print("Text kunci jawaban: ", kj)
print("Text Jawaban: ",jawaban)

Text kunci jawaban:  ITERA di bentuk berdasarkan SK kepres yg terletak di Lampung Selatan
Text Jawaban:  ITERA berlokasi di Lampung Selatan berdasarkan SK Menteri


## Pre-Processing

In [4]:
def case_folding(text):
  pattern = r'[' + string.punctuation + ']'
  punct = re.sub(pattern," ",str(text))
  case_fold = punct.lower()
  return case_fold

def spellcheck(text):
  text = spell_check(text)
  return text

def tokenization(text):
    tokens = re.split(' ',text)
    return tokens

def remove_digits(text):
  text = [item for item in text if item.isalpha()]
  return text

sw= nltk.corpus.stopwords.words('indonesian')
def remove_SW(text):
   text = [item for item in text if not item in sw]
   return text

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
  text = [stemmer.stem(item) for item in text]
  return text

def lemma(text):
  text = lemmatization(text)
  return text


### Case folding

In [5]:
case_fold_q = case_folding(kj)
case_fold_ans = case_folding(jawaban)

print("Text Kunci Jawaban: ", case_fold_q)
print("Text Jawaban: ",case_fold_ans)

Text Kunci Jawaban:  itera di bentuk berdasarkan sk kepres yg terletak di lampung selatan
Text Jawaban:  itera berlokasi di lampung selatan berdasarkan sk menteri


### Spell check

In [6]:
spell_q = spell_check(case_fold_q)
spell_ans = spell_check(case_fold_ans)

print("Sebelum: ", case_fold_ans)
print("Sesudah:", spell_ans)

Sebelum:  itera berlokasi di lampung selatan berdasarkan sk menteri
Sesudah: itera berlokasi di lampung selatan berdasarkan sk menteri


### Tokenization

In [7]:
token_q = tokenization(spell_q)
token_ans = tokenization(spell_ans)
token_q = remove_digits(token_q)
token_ans = remove_digits(token_ans)

print("Text Kunci Jawaban: ", token_q)
print("Text Jawaban: ", token_ans)

Text Kunci Jawaban:  ['itera', 'di', 'bentuk', 'berdasarkan', 'sk', 'kepres', 'yang', 'terletak', 'di', 'lampung', 'selatan']
Text Jawaban:  ['itera', 'berlokasi', 'di', 'lampung', 'selatan', 'berdasarkan', 'sk', 'menteri']


### Filtering

In [8]:
filter_q = remove_SW(token_q)
filter_ans = remove_SW(token_ans)

print("Text Kunci Jawaban: ", filter_q)
print("Text Jawaban: ", filter_ans)

Text Kunci Jawaban:  ['itera', 'bentuk', 'berdasarkan', 'sk', 'kepres', 'terletak', 'lampung', 'selatan']
Text Jawaban:  ['itera', 'berlokasi', 'lampung', 'selatan', 'berdasarkan', 'sk', 'menteri']


### Stemming

In [9]:
stem_q = stemming(filter_q)
stem_ans = stemming(filter_ans)

print("Text Kunci Jawaban: ", stem_q)
print("Text Jawaban: ", stem_ans)

Text Kunci Jawaban:  ['itera', 'bentuk', 'dasar', 'sk', 'pres', 'letak', 'lampung', 'selatan']
Text Jawaban:  ['itera', 'lokasi', 'lampung', 'selatan', 'dasar', 'sk', 'menteri']


### Lemmatization

In [10]:
lemma_q = lemma(filter_q)
lemma_ans = lemma(filter_ans)

print("Text Kunci Jawaban: ", lemma_q)
print("Text Jawaban: ", lemma_ans)

Text Kunci Jawaban:  ['itera', 'bentuk', 'berdasarkan', 'sk', 'kepres', 'terletak', 'lampung', 'selatan']
Text Jawaban:  ['itera', 'berlokasi', 'lampung', 'selatan', 'berdasarkan', 'sk', 'menteri']


## TF-IDF + Cosine + Score

In [11]:
def term(q, ans):
    for i in q:
        if i == '':
            q.remove('')
    for i in ans:
        if i == '':
            ans.remove('')
            
    BoWQ = set(q)
    BoWA = set(ans)

    uniqueWords = BoWQ.union(BoWA)
    # print(uniqueWords)

    numOfWordsQ = dict.fromkeys(uniqueWords, 0)
    for word in q:
        numOfWordsQ[word] += 1

    numOfWordsA = dict.fromkeys(uniqueWords, 0)
    for word in ans:
        numOfWordsA[word] += 1
    
    # print('Unique words', numOfWordsA)
        
    term = pd.DataFrame([numOfWordsQ, numOfWordsA])
    term = term.transpose()
    term.columns = ['TF_Q','TF_Ans']

    # display(term)

    dfQ = dict.fromkeys(uniqueWords, 0)
    for word in BoWQ:
        dfQ[word] += 1

    dfA= dict.fromkeys(uniqueWords, 0)
    for word in BoWA:
        dfA[word] += 1
    
    term['DF_Q'] = dfQ.values()
    term['DF_A'] = dfA.values()
    
    DF = []
    for i in range(len(uniqueWords)):
        DF.append(term['DF_Q'][i] + term['DF_A'][i])
    term['DF'] = DF
    # display(term)
    
    idfDict = []
    
    for i in range(len(term['DF'])):
        idfDict.append(math.log10((2+1) / (term['DF'][i] + 1))+1)
        # print(idfDict)
    term['IDF'] = idfDict

    # display(term)

    tfidfQ = []
    tfidfA = []
    for i in range(len(uniqueWords)):
        tfidfQ.append(term['TF_Q'][i]*term['IDF'][i])
        tfidfA.append(term['TF_Ans'][i]*term['IDF'][i])

    term['TF-IDF_Q'] = np.array(tfidfQ)
    term['TF-IDF_A'] = np.array(tfidfA)

    cosine = np.dot(tfidfQ,tfidfA)/(np.linalg.norm(tfidfQ)*np.linalg.norm(tfidfA))
    
    if math.isnan(cosine):
        cosine = 0
    print("Cosine Similarity:",cosine)
    print("Total kata",len(term))
    print('Skor: ', round((cosine*100),2))

    return term

In [12]:
print(len(tokenization(jawaban)))
print(len(tokenization(kj)))
print(tokenization(kj))
print(tokenization(jawaban))

8
11
['ITERA', 'di', 'bentuk', 'berdasarkan', 'SK', 'kepres', 'yg', 'terletak', 'di', 'Lampung', 'Selatan']
['ITERA', 'berlokasi', 'di', 'Lampung', 'Selatan', 'berdasarkan', 'SK', 'Menteri']


In [13]:
print("Tanpa pre-processing",'\n')
term(tokenization(kj),tokenization(jawaban))

Tanpa pre-processing 

Cosine Similarity: 0.6201743099878875
Total kata 12
Skor:  62.02


Unnamed: 0,TF_Q,TF_Ans,DF_Q,DF_A,DF,IDF,TF-IDF_Q,TF-IDF_A
yg,1,0,1,0,1,1.176091,1.176091,0.0
terletak,1,0,1,0,1,1.176091,1.176091,0.0
berdasarkan,1,1,1,1,2,1.0,1.0,1.0
SK,1,1,1,1,2,1.0,1.0,1.0
berlokasi,0,1,0,1,1,1.176091,0.0,1.176091
Lampung,1,1,1,1,2,1.0,1.0,1.0
Selatan,1,1,1,1,2,1.0,1.0,1.0
ITERA,1,1,1,1,2,1.0,1.0,1.0
kepres,1,0,1,0,1,1.176091,1.176091,0.0
bentuk,1,0,1,0,1,1.176091,1.176091,0.0


In [14]:
print("Stemming",'\n')
term(stem_q, stem_ans)

Stemming 

Cosine Similarity: 0.5931441638256351
Total kata 10
Skor:  59.31


Unnamed: 0,TF_Q,TF_Ans,DF_Q,DF_A,DF,IDF,TF-IDF_Q,TF-IDF_A
menteri,0,1,0,1,1,1.176091,0.0,1.176091
sk,1,1,1,1,2,1.0,1.0,1.0
itera,1,1,1,1,2,1.0,1.0,1.0
lampung,1,1,1,1,2,1.0,1.0,1.0
letak,1,0,1,0,1,1.176091,1.176091,0.0
lokasi,0,1,0,1,1,1.176091,0.0,1.176091
pres,1,0,1,0,1,1.176091,1.176091,0.0
dasar,1,1,1,1,2,1.0,1.0,1.0
selatan,1,1,1,1,2,1.0,1.0,1.0
bentuk,1,0,1,0,1,1.176091,1.176091,0.0


In [15]:
print("Lemmatization",'\n')
term(lemma_q, lemma_ans)

Lemmatization 

Cosine Similarity: 0.5931441638256351
Total kata 10
Skor:  59.31


Unnamed: 0,TF_Q,TF_Ans,DF_Q,DF_A,DF,IDF,TF-IDF_Q,TF-IDF_A
menteri,0,1,0,1,1,1.176091,0.0,1.176091
sk,1,1,1,1,2,1.0,1.0,1.0
itera,1,1,1,1,2,1.0,1.0,1.0
terletak,1,0,1,0,1,1.176091,1.176091,0.0
lampung,1,1,1,1,2,1.0,1.0,1.0
berdasarkan,1,1,1,1,2,1.0,1.0,1.0
berlokasi,0,1,0,1,1,1.176091,0.0,1.176091
selatan,1,1,1,1,2,1.0,1.0,1.0
kepres,1,0,1,0,1,1.176091,1.176091,0.0
bentuk,1,0,1,0,1,1.176091,1.176091,0.0
