In [50]:
import pandas as pd
import nltk
import math



### 1. **Likelihood**
- **Likelihood** mengukur probabilitas suatu data (biasanya kumpulan kalimat atau observasi) diberikan sebuah model probabilistik. Dalam konteks PCFG, likelihood menunjukkan sejauh mana model PCFG menjelaskan data input berdasarkan aturan-aturan dan probabilitas yang diberikan.

- **Matematis:**
  Likelihood dihitung sebagai:
  \[
  \mathcal{L}(D \mid \theta) = P(D \mid \theta)
  \]
  di mana:
  - \(D\) adalah data (contoh: kumpulan kalimat),
  - \(\theta\) adalah parameter model (contoh: probabilitas pada aturan PCFG).

- **Log-Likelihood:**
  Karena probabilitas sering kali sangat kecil, log-likelihood lebih sering digunakan:
  \[
  \log \mathcal{L}(D \mid \theta) = \sum_{i=1}^N \log P(x_i \mid \theta)
  \]
  di mana \(x_i\) adalah kalimat atau data individu dalam dataset \(D\).

---

### 2. **Perplexity**
- **Perplexity** adalah metrik yang menunjukkan seberapa "kaget" model terhadap data baru. Semakin rendah perplexity, semakin baik model dalam memprediksi data tersebut.

- **Matematis:**
  Perplexity didefinisikan sebagai:
  \[
  PP(D) = 2^{-\frac{1}{N} \sum_{i=1}^N \log_2 P(x_i \mid \theta)}
  \]
  di mana:
  - \(N\) adalah jumlah total token (atau kalimat) dalam dataset \(D\),
  - \(P(x_i \mid \theta)\) adalah probabilitas data \(x_i\) menurut model.

  Dalam bentuk lain:
  \[
  PP(D) = \exp\left(-\frac{1}{N} \sum_{i=1}^N \log P(x_i \mid \theta)\right)
  \]
  Perplexity rendah berarti model memiliki kemampuan yang lebih baik untuk "menjelaskan" data.



In [51]:
df = pd.read_excel("raw/df_test_fixed.xlsx", usecols=["Sentence", "Parsing"])
df = df.rename(columns={"Parsing": "Parsed"})
df["Sentence"] = df["Sentence"].apply(lambda x: x.lower().split())
df["Parsed"] = df["Parsed"].apply(lambda x: nltk.Tree.fromstring(x))
df["Productions"] = df["Parsed"].apply(lambda x: x.productions())
df

Unnamed: 0,Sentence,Parsed,Productions
0,"[ade, buron, tawah, to, to, ya, meme]","[[[(Noun ade)]], [[(NP (Pronoun buron)), (Det ...","[K -> S P O Pel, S -> NP, NP -> Noun, Noun -> ..."
1,"[ni, bawang, laut, megedi, sambilange, ngeling...","[[[(Noun ni)]], [[(NP (Noun bawang)), (Det lau...","[K -> S P O Pel, S -> NP, NP -> Noun, Noun -> ..."
2,"[aduh, keker, keker]","[[[(Pronoun aduh)]], [[(NP (Noun keker)), (Det...","[K -> S P, S -> NP, NP -> Pronoun, Pronoun -> ..."
3,"[wenten, kaasi, i, kelesih, di, selagan, nebe]","[[[(NP (NP (Noun wenten)) (Det kaasi)), (Det i...","[K -> S P Ket, S -> NP, NP -> NP Det, NP -> NP..."
4,"[buin, kasautin, ban, i, lelipi, kene]","[[[(Noun buin)]], [[(Noun kasautin)]], [[(NP (...","[K -> S P O Pel, S -> NP, NP -> Noun, Noun -> ..."
...,...,...,...
165,"[da, nangis, ratu, raden, galuh]","[[[(Pronoun da)]], [[(NP (Noun nangis)), (Det ...","[K -> S P Pel, S -> NP, NP -> Pronoun, Pronoun..."
166,"[mati, lantas, i, lutung]","[[[(Pronoun mati)]], [[(NP (NP (Noun lantas)) ...","[K -> S P, S -> NP, NP -> Pronoun, Pronoun -> ..."
167,"[mulihan, lantas, i, patih]","[[[(Pronoun mulihan)]], [[(NP (NP (Noun lantas...","[K -> S P, S -> NP, NP -> Pronoun, Pronoun -> ..."
168,"[mulihan, lantas, raden, mantri, teken, iringane]","[[[(Pronoun mulihan)]], [[(Noun lantas)]], [[(...","[K -> S P O Ket, S -> NP, NP -> Pronoun, Prono..."


In [52]:
df_sentences_rules = pd.DataFrame({"production": pd.Series(), "sum_production": pd.Series(), "sum_lhs": pd.Series()})
for productions in df["Productions"]:
  for production in productions:
    df_sentences_rules = pd.concat([df_sentences_rules, pd.DataFrame({"production": pd.Series([production]), "sum_production": pd.Series([1]), "sum_lhs": pd.Series([0])})], ignore_index=True)
df_sentences_rules["lhs"] = df_sentences_rules["production"].apply(lambda n: n.lhs())
df_sentences_rules["production"] = df_sentences_rules["production"].astype(str)
df_sentences_rules["lhs"] = df_sentences_rules["lhs"].astype(str)
df_sentences_rules = df_sentences_rules.groupby("production").agg({
  "lhs": "first",
  "sum_production": "sum",
  "sum_lhs": "sum"
}).reset_index()
df_sentences_rules["sum_lhs"] = df_sentences_rules.groupby("lhs")["sum_production"].transform("sum")
df_sentences_rules = df_sentences_rules[df_sentences_rules["sum_lhs"] > 0]
df_sentences_rules['probability'] = df_sentences_rules['sum_production'] / df_sentences_rules['sum_lhs']
df_sentences_rules

Unnamed: 0,production,lhs,sum_production,sum_lhs,probability
0,Adj -> 'barak',Adj,1,14,0.071429
1,Adj -> 'beneh',Adj,1,14,0.071429
2,Adj -> 'ical',Adj,2,14,0.142857
3,Adj -> 'lebeng',Adj,1,14,0.071429
4,Adj -> 'matur',Adj,1,14,0.071429
...,...,...,...,...,...
623,Verb -> 'pramesti',Verb,1,27,0.037037
624,Verb -> 'sampi',Verb,1,27,0.037037
625,Verb -> 'sube',Verb,1,27,0.037037
626,Verb -> 'sumandea',Verb,1,27,0.037037


In [53]:
grammar_pcfg = ""
for prod, prob in zip(df_sentences_rules["production"], df_sentences_rules["probability"]):
  grammar_pcfg += f"{prod} [{prob}]\n"
grammar_pcfg = nltk.PCFG.fromstring(grammar_pcfg)
grammar_pcfg._start = nltk.Nonterminal("K")
parser_pcfg = nltk.ViterbiParser(grammar_pcfg)

In [54]:
# df["Predicted"] = pd.Series("", index=range(len(df)))
df["Probability"] = pd.Series(0, index=range(len(df)))

for i, row in df.iterrows():
  try:
    for tree in parser_pcfg.parse(row["Sentence"]):
      # df.loc[i, "Predicted"] = tree
      df.loc[i, "Probability"] = tree.prob()
  except Exception as e:
    # df.loc[i, "Predicted"] = ""
    df.loc[i, "Probability"] = 0
df

  df.loc[i, "Probability"] = tree.prob()


Unnamed: 0,Sentence,Parsed,Productions,Probability
0,"[ade, buron, tawah, to, to, ya, meme]","[[[(Noun ade)]], [[(NP (Pronoun buron)), (Det ...","[K -> S P O Pel, S -> NP, NP -> Noun, Noun -> ...",3.414092e-21
1,"[ni, bawang, laut, megedi, sambilange, ngeling...","[[[(Noun ni)]], [[(NP (Noun bawang)), (Det lau...","[K -> S P O Pel, S -> NP, NP -> Noun, Noun -> ...",2.208987e-22
2,"[aduh, keker, keker]","[[[(Pronoun aduh)]], [[(NP (Noun keker)), (Det...","[K -> S P, S -> NP, NP -> Pronoun, Pronoun -> ...",9.362428e-10
3,"[wenten, kaasi, i, kelesih, di, selagan, nebe]","[[[(NP (NP (Noun wenten)) (Det kaasi)), (Det i...","[K -> S P Ket, S -> NP, NP -> NP Det, NP -> NP...",1.489940e-19
4,"[buin, kasautin, ban, i, lelipi, kene]","[[[(Noun buin)]], [[(Noun kasautin)]], [[(NP (...","[K -> S P O Pel, S -> NP, NP -> Noun, Noun -> ...",8.016860e-16
...,...,...,...,...
165,"[da, nangis, ratu, raden, galuh]","[[[(Pronoun da)]], [[(NP (Noun nangis)), (Det ...","[K -> S P Pel, S -> NP, NP -> Pronoun, Pronoun...",1.623036e-14
166,"[mati, lantas, i, lutung]","[[[(Pronoun mati)]], [[(NP (NP (Noun lantas)) ...","[K -> S P, S -> NP, NP -> Pronoun, Pronoun -> ...",1.676910e-08
167,"[mulihan, lantas, i, patih]","[[[(Pronoun mulihan)]], [[(NP (NP (Noun lantas...","[K -> S P, S -> NP, NP -> Pronoun, Pronoun -> ...",1.597057e-09
168,"[mulihan, lantas, raden, mantri, teken, iringane]","[[[(Pronoun mulihan)]], [[(Noun lantas)]], [[(...","[K -> S P O Ket, S -> NP, NP -> Pronoun, Prono...",3.321596e-15


In [59]:
log_likelihood = df["Probability"].apply(lambda p: math.log10(p)).sum()
print(f"Log Likelihood: {log_likelihood}")

Log Likelihood: -2575.830780129828


In [60]:
n = df["Productions"].apply(len).sum()
perplexity = 2**((-1 / n) * log_likelihood)
print(f"Perplexity: {perplexity}")

Perplexity: 1.9829731956571226
