In [39]:
import pandas as pd
import nltk
import math
import re



### 1. **Likelihood**
- **Likelihood** mengukur probabilitas suatu data (biasanya kumpulan kalimat atau observasi) diberikan sebuah model probabilistik. Dalam konteks PCFG, likelihood menunjukkan sejauh mana model PCFG menjelaskan data input berdasarkan aturan-aturan dan probabilitas yang diberikan.

- **Matematis:**
  Likelihood dihitung sebagai:
  \[
  \mathcal{L}(D \mid \theta) = P(D \mid \theta)
  \]
  di mana:
  - \(D\) adalah data (contoh: kumpulan kalimat),
  - \(\theta\) adalah parameter model (contoh: probabilitas pada aturan PCFG).

- **Log-Likelihood:**
  Karena probabilitas sering kali sangat kecil, log-likelihood lebih sering digunakan:
  \[
  \log \mathcal{L}(D \mid \theta) = \sum_{i=1}^N \log P(x_i \mid \theta)
  \]
  di mana \(x_i\) adalah kalimat atau data individu dalam dataset \(D\).

---

### 2. **Perplexity**
- **Perplexity** adalah metrik yang menunjukkan seberapa "kaget" model terhadap data baru. Semakin rendah perplexity, semakin baik model dalam memprediksi data tersebut.

- **Matematis:**
  Perplexity didefinisikan sebagai:
  \[
  PP(D) = 2^{-\frac{1}{N} \sum_{i=1}^N \log_2 P(x_i \mid \theta)}
  \]
  di mana:
  - \(N\) adalah jumlah total token (atau kalimat) dalam dataset \(D\),
  - \(P(x_i \mid \theta)\) adalah probabilitas data \(x_i\) menurut model.

  Dalam bentuk lain:
  \[
  PP(D) = \exp\left(-\frac{1}{N} \sum_{i=1}^N \log P(x_i \mid \theta)\right)
  \]
  Perplexity rendah berarti model memiliki kemampuan yang lebih baik untuk "menjelaskan" data.



In [40]:
df = pd.read_excel("raw/df_test_fixed.xlsx", usecols=["Sentence", "Parsing"])
df = df.rename(columns={"Parsing": "Parsed"})
df["Sentence"] = df["Sentence"].apply(lambda x: x.lower().split())
# df["Parsed"] = df["Parsed"].apply(lambda x: nltk.Tree.fromstring(x))
# df["Productions"] = df["Parsed"].apply(lambda x: x.productions())
df

Unnamed: 0,Sentence,Parsed
0,"[ade, buron, tawah, to, to, ya, meme]",(K\n (S (NP (Noun ade)))\n (P (NP (NP (Prono...
1,"[ni, bawang, laut, megedi, sambilange, ngeling...",(K\n (S (NP (Noun ni)))\n (P (NP (NP (Noun b...
2,"[aduh, keker, keker]",(K (S (NP (Pronoun aduh))) (P (NP (NP (Noun ke...
3,"[wenten, kaasi, i, kelesih, di, selagan, nebe]",(K\n (S (NP (NP (NP (Noun wenten)) (Det kaasi...
4,"[buin, kasautin, ban, i, lelipi, kene]",(K\n (S (NP (Noun buin)))\n (P (NP (Noun kas...
...,...,...
165,"[da, nangis, ratu, raden, galuh]",(K\n (S (NP (Pronoun da)))\n (P (NP (NP (Nou...
166,"[mati, lantas, i, lutung]",(K\n (S (NP (Pronoun mati)))\n (P (NP (NP (N...
167,"[mulihan, lantas, i, patih]",(K\n (S (NP (Pronoun mulihan)))\n (P (NP (NP...
168,"[mulihan, lantas, raden, mantri, teken, iringane]",(K\n (S (NP (Pronoun mulihan)))\n (P (NP (No...


In [41]:
# tulis rule pcfg
df_rules = pd.read_csv("rules/probabilistic_rules.csv")
grammar_pcfg = ""
for prod, prob in zip(df_rules["production"], df_rules["probability"]):
  grammar_pcfg += f"{prod} [{prob}]\n"
grammar_pcfg = nltk.PCFG.fromstring(grammar_pcfg)
grammar_pcfg._start = nltk.Nonterminal("K")
parser_pcfg = nltk.ViterbiParser(grammar_pcfg)

In [42]:
df["Predicted"] = pd.Series("", index=range(len(df)))
df["Predicted_Score"] = pd.Series(0, index=range(len(df)))
for i, row in df.iterrows():
  for tree in parser_pcfg.parse(row["Sentence"]):
    df.loc[i, "Predicted"] = re.sub(r" \(p=.*\)", "", str(tree))
    df.loc[i, "Predicted_Score"] = tree.prob()
df

  df.loc[i, "Predicted_Score"] = tree.prob()


Unnamed: 0,Sentence,Parsed,Predicted,Predicted_Score
0,"[ade, buron, tawah, to, to, ya, meme]",(K\n (S (NP (Noun ade)))\n (P (NP (NP (Prono...,(K\n (S (NP (Noun ade)))\n (P\n (NP\n ...,4.212710e-24
1,"[ni, bawang, laut, megedi, sambilange, ngeling...",(K\n (S (NP (Noun ni)))\n (P (NP (NP (Noun b...,(K\n (S (NP (Noun ni)))\n (P (NP (Noun bawan...,1.057747e-24
2,"[aduh, keker, keker]",(K (S (NP (Pronoun aduh))) (P (NP (NP (Noun ke...,(K\n (S (NP (Pronoun aduh)))\n (P (NP (Noun ...,2.847637e-10
3,"[wenten, kaasi, i, kelesih, di, selagan, nebe]",(K\n (S (NP (NP (NP (Noun wenten)) (Det kaasi...,(K\n (S (NP (NP (NP (Noun wenten)) (Det kaasi...,1.814254e-21
4,"[buin, kasautin, ban, i, lelipi, kene]",(K\n (S (NP (Noun buin)))\n (P (NP (Noun kas...,(K\n (S (NP (Noun buin)))\n (P\n (NP\n ...,1.846960e-17
...,...,...,...,...
165,"[da, nangis, ratu, raden, galuh]",(K\n (S (NP (Pronoun da)))\n (P (NP (NP (Nou...,(K\n (S (NP (NP (Pronoun da)) (Det nangis)))\...,5.603122e-15
166,"[mati, lantas, i, lutung]",(K\n (S (NP (Pronoun mati)))\n (P (NP (NP (N...,(K\n (S (NP (Pronoun mati)))\n (P (NP (NP (N...,7.960556e-09
167,"[mulihan, lantas, i, patih]",(K\n (S (NP (Pronoun mulihan)))\n (P (NP (NP...,(K\n (S (NP (Pronoun mulihan)))\n (P (NP (NP...,2.186966e-11
168,"[mulihan, lantas, raden, mantri, teken, iringane]",(K\n (S (NP (Pronoun mulihan)))\n (P (NP (No...,(K\n (S (NP (Pronoun mulihan)))\n (P (NP (No...,3.355596e-17


In [43]:
log_likelihood = df["Predicted_Score"].apply(lambda p: math.log10(p)).sum()
print(f"Log Likelihood: {log_likelihood}")

Log Likelihood: -2873.9262031920134


In [44]:
n = df["Parsed"].apply(lambda x: nltk.Tree.fromstring(x).productions()).apply(len).sum()
perplexity = 2**((-1 / n) * log_likelihood)
print(f"Perplexity: {perplexity}")

Perplexity: 2.1464693446400522


In [45]:
print(df["Predicted"][0])

(K
  (S (NP (Noun ade)))
  (P
    (NP
      (NP (NP (NP (Pronoun buron)) (Det tawah)) (Det to))
      (Det to)))
  (O (NP (Pronoun ya)))
  (Pel (NP (Noun meme))))


In [46]:
df["Parsed"] = df["Parsed"].apply(lambda x: nltk.Tree.fromstring(x))
df["Predicted"] = df["Predicted"].apply(lambda x: nltk.Tree.fromstring(x))
df

Unnamed: 0,Sentence,Parsed,Predicted,Predicted_Score
0,"[ade, buron, tawah, to, to, ya, meme]","[[[(Noun ade)]], [[(NP (Pronoun buron)), (Det ...","[[[(Noun ade)]], [[(NP (NP (NP (Pronoun buron)...",4.212710e-24
1,"[ni, bawang, laut, megedi, sambilange, ngeling...","[[[(Noun ni)]], [[(NP (Noun bawang)), (Det lau...","[[[(Noun ni)]], [[(Noun bawang)]], [[(NP (NP (...",1.057747e-24
2,"[aduh, keker, keker]","[[[(Pronoun aduh)]], [[(NP (Noun keker)), (Det...","[[[(Pronoun aduh)]], [[(Noun keker)]], [[(Noun...",2.847637e-10
3,"[wenten, kaasi, i, kelesih, di, selagan, nebe]","[[[(NP (NP (Noun wenten)) (Det kaasi)), (Det i...","[[[(NP (NP (Noun wenten)) (Det kaasi)), (Det i...",1.814254e-21
4,"[buin, kasautin, ban, i, lelipi, kene]","[[[(Noun buin)]], [[(Noun kasautin)]], [[(NP (...","[[[(Noun buin)]], [[(NP (NP (NP (Noun kasautin...",1.846960e-17
...,...,...,...,...
165,"[da, nangis, ratu, raden, galuh]","[[[(Pronoun da)]], [[(NP (Noun nangis)), (Det ...","[[[(NP (Pronoun da)), (Det nangis)]], [[(Noun ...",5.603122e-15
166,"[mati, lantas, i, lutung]","[[[(Pronoun mati)]], [[(NP (NP (Noun lantas)) ...","[[[(Pronoun mati)]], [[(NP (Noun lantas)), (De...",7.960556e-09
167,"[mulihan, lantas, i, patih]","[[[(Pronoun mulihan)]], [[(NP (NP (Noun lantas...","[[[(Pronoun mulihan)]], [[(NP (Noun lantas)), ...",2.186966e-11
168,"[mulihan, lantas, raden, mantri, teken, iringane]","[[[(Pronoun mulihan)]], [[(Noun lantas)]], [[(...","[[[(Pronoun mulihan)]], [[(Noun lantas)]], [[(...",3.355596e-17


In [47]:
df["Intersection"] = df.apply(lambda row: list(set(row["Parsed"].productions()).intersection(set(row["Predicted"].productions()))), axis=1)
df["TP"] = df["Intersection"].apply(len)
df["FP"] = df.apply(lambda row: list(set(row["Predicted"].productions()) - set(row["Parsed"].productions())), axis=1).apply(len)
df["FN"] = df.apply(lambda row: list(set(row["Parsed"].productions()) - set(row["Predicted"].productions())), axis=1).apply(len)
df

Unnamed: 0,Sentence,Parsed,Predicted,Predicted_Score,Intersection,TP,FP,FN
0,"[ade, buron, tawah, to, to, ya, meme]","[[[(Noun ade)]], [[(NP (Pronoun buron)), (Det ...","[[[(Noun ade)]], [[(NP (NP (NP (Pronoun buron)...",4.212710e-24,"[NP -> NP Det, P -> NP, Noun -> 'meme', NP -> ...",14,0,2
1,"[ni, bawang, laut, megedi, sambilange, ngeling...","[[[(Noun ni)]], [[(NP (Noun bawang)), (Det lau...","[[[(Noun ni)]], [[(Noun bawang)]], [[(NP (NP (...",1.057747e-24,"[Noun -> 'megedi', Det -> 'sambilange', NP -> ...",15,1,1
2,"[aduh, keker, keker]","[[[(Pronoun aduh)]], [[(NP (Noun keker)), (Det...","[[[(Pronoun aduh)]], [[(Noun keker)]], [[(Noun...",2.847637e-10,"[P -> NP, NP -> Pronoun, Noun -> 'keker', Pron...",6,2,3
3,"[wenten, kaasi, i, kelesih, di, selagan, nebe]","[[[(NP (NP (Noun wenten)) (Det kaasi)), (Det i...","[[[(NP (NP (Noun wenten)) (Det kaasi)), (Det i...",1.814254e-21,"[NP -> NP Det, P -> NP, Det -> 'kaasi', PP -> ...",15,0,0
4,"[buin, kasautin, ban, i, lelipi, kene]","[[[(Noun buin)]], [[(Noun kasautin)]], [[(NP (...","[[[(Noun buin)]], [[(NP (NP (NP (Noun kasautin...",1.846960e-17,"[NP -> NP Det, P -> NP, Noun -> 'kasautin', NP...",9,4,5
...,...,...,...,...,...,...,...,...
165,"[da, nangis, ratu, raden, galuh]","[[[(Pronoun da)]], [[(NP (Noun nangis)), (Det ...","[[[(NP (Pronoun da)), (Det nangis)]], [[(Noun ...",5.603122e-15,"[NP -> NP Det, P -> NP, NP -> Pronoun, Pel -> ...",10,2,2
166,"[mati, lantas, i, lutung]","[[[(Pronoun mati)]], [[(NP (NP (Noun lantas)) ...","[[[(Pronoun mati)]], [[(NP (Noun lantas)), (De...",7.960556e-09,"[NP -> NP Det, P -> NP, NP -> Pronoun, Noun ->...",9,2,2
167,"[mulihan, lantas, i, patih]","[[[(Pronoun mulihan)]], [[(NP (NP (Noun lantas...","[[[(Pronoun mulihan)]], [[(NP (Noun lantas)), ...",2.186966e-11,"[NP -> NP Det, Pronoun -> 'mulihan', P -> NP, ...",8,3,2
168,"[mulihan, lantas, raden, mantri, teken, iringane]","[[[(Pronoun mulihan)]], [[(Noun lantas)]], [[(...","[[[(Pronoun mulihan)]], [[(Noun lantas)]], [[(...",3.355596e-17,"[NP -> NP Det, PP -> Prep NP, Pronoun -> 'muli...",15,0,0


In [48]:
tp = df["TP"].sum()
fp = df["FP"].sum()
fn = df["FN"].sum()
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"Precision: {precision}")
print(f"Recall   : {recall}")
print(f"F1-Score : {f1_score}")

Precision: 0.8865750996898538
Recall   : 0.8857901726427623
F1-Score : 0.8861824623560672
