In [12]:
import pandas as pd
import nltk



### 1. **Likelihood**
- **Likelihood** mengukur probabilitas suatu data (biasanya kumpulan kalimat atau observasi) diberikan sebuah model probabilistik. Dalam konteks PCFG, likelihood menunjukkan sejauh mana model PCFG menjelaskan data input berdasarkan aturan-aturan dan probabilitas yang diberikan.

- **Matematis:**
  Likelihood dihitung sebagai:
  \[
  \mathcal{L}(D \mid \theta) = P(D \mid \theta)
  \]
  di mana:
  - \(D\) adalah data (contoh: kumpulan kalimat),
  - \(\theta\) adalah parameter model (contoh: probabilitas pada aturan PCFG).

- **Log-Likelihood:**
  Karena probabilitas sering kali sangat kecil, log-likelihood lebih sering digunakan:
  \[
  \log \mathcal{L}(D \mid \theta) = \sum_{i=1}^N \log P(x_i \mid \theta)
  \]
  di mana \(x_i\) adalah kalimat atau data individu dalam dataset \(D\).

---

### 2. **Perplexity**
- **Perplexity** adalah metrik yang menunjukkan seberapa "kaget" model terhadap data baru. Semakin rendah perplexity, semakin baik model dalam memprediksi data tersebut.

- **Matematis:**
  Perplexity didefinisikan sebagai:
  \[
  PP(D) = 2^{-\frac{1}{N} \sum_{i=1}^N \log_2 P(x_i \mid \theta)}
  \]
  di mana:
  - \(N\) adalah jumlah total token (atau kalimat) dalam dataset \(D\),
  - \(P(x_i \mid \theta)\) adalah probabilitas data \(x_i\) menurut model.

  Dalam bentuk lain:
  \[
  PP(D) = \exp\left(-\frac{1}{N} \sum_{i=1}^N \log P(x_i \mid \theta)\right)
  \]
  Perplexity rendah berarti model memiliki kemampuan yang lebih baik untuk "menjelaskan" data.



In [13]:
df = pd.read_excel("raw/test_kalimat.xlsx")
df

Unnamed: 0,Sentence,Parsed
0,Sengap paling Dadong Jepun mangiwasin,(K (S (NP (Noun sengap))) (P (NP (NP (Pronoun ...
1,Oo Beli mara teka Luh,(K (S (NP (Noun oo))) (P (NP (Noun beli))) (Ke...
2,Men keto suba matine katepuk,(K (S (NP (NP (Noun men)) (Noun keto))) (P (VP...
3,Uli ditu baana ngintip,(K (S (NP (Noun uli))) (P (VP (Adv ditu) (VP (...
4,Tiang mamusuh teken ia,(K (S (NP (Pronoun tiang))) (P (VP (Verb mamus...
5,jani makejang burone sakti,(K (S (NP (Noun jani))) (P (NP (Pronoun makeja...
6,Munyinne suba ngarwanang anak len,(K (S (NP (Pronoun munyinne))) (P (VP (Adv sub...
7,Metu rasa sumanangsaya idane,(K (S (NP (Pronoun metu))) (P (NP (Pronoun ras...
8,Ia tusing ngelah bulu,(K (S (NP (Pronoun ia))) (P (VP (Adv tusing) (...
9,Kaden tiang ada pancabaya,(K (S (NP (Pronoun kaden))) (P (NP (Pronoun ti...


In [14]:
df = df[df['Sentence'].str.split().apply(len) == 3]
df = df.sample(n=6, random_state=42).reset_index()
df = df[["Sentence", "Parsed"]]
df

Unnamed: 0,Sentence,Parsed
0,Keto baos Idane,(K (S (NP (Noun keto))) (P (NP (NP (Noun baos)...
1,Inggih titiang makta,(K (S (NP (Noun inggih))) (P (NP (Pronoun titi...
2,Inggih titiang sairing,(K (S (NP (Noun inggih))) (P (NP (NP (Pronoun ...
3,Tiang naur nika,(K (S (NP (Pronoun tiang))) (P (NP (NP (Noun n...
4,Sajaan ento Dong,(K (S (NP (Noun sajaan))) (P (NP (NP (Pronoun ...
5,Keto munyin memene,(K (S (NP (Noun keto))) (P (NP (NP (Noun munyi...


In [15]:
df["Predicted"] = pd.Series("", index=range(len(df)))
df["Probability"] = pd.Series(0, index=range(len(df)))
df

Unnamed: 0,Sentence,Parsed,Predicted,Probability
0,Keto baos Idane,(K (S (NP (Noun keto))) (P (NP (NP (Noun baos)...,,0
1,Inggih titiang makta,(K (S (NP (Noun inggih))) (P (NP (Pronoun titi...,,0
2,Inggih titiang sairing,(K (S (NP (Noun inggih))) (P (NP (NP (Pronoun ...,,0
3,Tiang naur nika,(K (S (NP (Pronoun tiang))) (P (NP (NP (Noun n...,,0
4,Sajaan ento Dong,(K (S (NP (Noun sajaan))) (P (NP (NP (Pronoun ...,,0
5,Keto munyin memene,(K (S (NP (Noun keto))) (P (NP (NP (Noun munyi...,,0


In [None]:
df_rules = pd.read_csv("rules/probabilistic_rules.csv")
grammar_cfg = ""
for prod in df_rules["production"]:
  grammar_cfg += f"{prod}\n"
grammar_cfg = nltk.CFG.fromstring(grammar_cfg)
grammar_cfg._start = nltk.Nonterminal("K")
parser_cfg = nltk.ChartParser(grammar_cfg)

for s, value_s in df.iterrows():
  sentence = value_s["Sentence"].lower().split()
  result = pd.DataFrame({"tree": list(parser_cfg.parse(sentence)), "probability": [1 for _ in range(len(list(parser_cfg.parse(sentence))))]})
  for i, value_i in result.iterrows():
    for j in value_i["tree"].productions():
      for k, value_k in df_rules.iterrows():
        if nltk.CFG.fromstring(value_k["production"]).productions()[0] == j:
          result["probability"][i] *= value_k["probability"]
          continue
  # print(result["tree"][0])
  df.loc[s, "Predicted"] = str(result["tree"][0])
  df.loc[s, "Probability"] = result["probability"][0]

df

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  result["probability"][i] *= value_k["probability"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result["prob

Unnamed: 0,Sentence,Parsed,Predicted,Probability
0,Keto baos Idane,(K (S (NP (Noun keto))) (P (NP (NP (Noun baos)...,(K (S (NP (Noun keto))) (P (NP (NP (Noun baos)...,2.9373470923609837e-17
1,Inggih titiang makta,(K (S (NP (Noun inggih))) (P (NP (Pronoun titi...,(K\n (S (NP (PropNoun inggih)))\n (P (VP (Ad...,1.0388076354576695e-11
2,Inggih titiang sairing,(K (S (NP (Noun inggih))) (P (NP (NP (Pronoun ...,(K\n (S (NP (PropNoun inggih)))\n (P (VP (Ad...,1.0388076354576695e-11
3,Tiang naur nika,(K (S (NP (Pronoun tiang))) (P (NP (NP (Noun n...,(K (S (NP (Noun tiang))) (P (NP (NP (Noun naur...,6.529864821783473e-15
4,Sajaan ento Dong,(K (S (NP (Noun sajaan))) (P (NP (NP (Pronoun ...,(K (S (NP (Noun sajaan))) (P (VP (Adv ento) (V...,3.1867870010596813e-12
5,Keto munyin memene,(K (S (NP (Noun keto))) (P (NP (NP (Noun munyi...,(K\n (S (NP (Noun keto)))\n (P (NP (NP (Prop...,2.898920203245869e-19


In [18]:
df["Probability"] = df["Probability"].astype(float)
df

Unnamed: 0,Sentence,Parsed,Predicted,Probability
0,Keto baos Idane,(K (S (NP (Noun keto))) (P (NP (NP (Noun baos)...,(K (S (NP (Noun keto))) (P (NP (NP (Noun baos)...,2.937347e-17
1,Inggih titiang makta,(K (S (NP (Noun inggih))) (P (NP (Pronoun titi...,(K\n (S (NP (PropNoun inggih)))\n (P (VP (Ad...,1.038808e-11
2,Inggih titiang sairing,(K (S (NP (Noun inggih))) (P (NP (NP (Pronoun ...,(K\n (S (NP (PropNoun inggih)))\n (P (VP (Ad...,1.038808e-11
3,Tiang naur nika,(K (S (NP (Pronoun tiang))) (P (NP (NP (Noun n...,(K (S (NP (Noun tiang))) (P (NP (NP (Noun naur...,6.529865e-15
4,Sajaan ento Dong,(K (S (NP (Noun sajaan))) (P (NP (NP (Pronoun ...,(K (S (NP (Noun sajaan))) (P (VP (Adv ento) (V...,3.186787e-12
5,Keto munyin memene,(K (S (NP (Noun keto))) (P (NP (NP (Noun munyi...,(K\n (S (NP (Noun keto)))\n (P (NP (NP (Prop...,2.8989199999999995e-19


In [19]:
import math

In [20]:
log_likelihood = df["Probability"].apply(lambda p: math.log10(p)).sum()
log_likelihood

np.float64(-82.71848097689603)

In [21]:
perplexity = 2**((-1 / len(df)) * log_likelihood)
perplexity

np.float64(14129.408943312555)

In [22]:
df.to_excel("raw/tested_kalimat_.xlsx", index=False)