# EN.605. 646 01 : Natural Language Processing 
# Lab 2 (Tim Chen)
## (a) Simple Character LM

Disclaimer: I am using ChatGPT to assist the results analysis.

(e.g. markdown table format & insights)

In [1]:
import assets.charlm as lm

mylm = lm.train_char_lm('assets/subtitles.txt', 4)

In [2]:
lm.print_probs(mylm, 'atio')
lm.print_probs(mylm, 'nivi')
lm.print_probs(mylm, 'supe')

[('n', 0.9940436161014506),
 (' ', 0.00220962628494572),
 ('.', 0.0013930252665962147),
 (',', 0.0009607070804111826),
 ('?', 0.0003362474781439139),
 ("'", 0.00024017677010279565),
 ('u', 0.00019214141608223654),
 ('"', 0.0001441060620616774),
 ('s', 0.0001441060620616774),
 ('-', 9.607070804111827e-05),
 ('!', 4.8035354020559135e-05),
 (':', 4.8035354020559135e-05),
 ('m', 4.8035354020559135e-05),
 ('p', 4.8035354020559135e-05),
 ('r', 4.8035354020559135e-05)]
[('n', 0.8), ('e', 0.1), ('s', 0.1)]
[('r', 0.9992144540455616), ('s', 0.0007855459544383347)]


In [3]:
# generate 10 samples of 80 characters
samples = [lm.generate_text(mylm, 4, 80) for _ in range(10)]

In [4]:
for s in [samples[4], samples[8], samples[9]]:
    print('---')
    print(s)

---
I remembered?
They welcomes these alled you're politary.
It's Smart must...
THEN
---
Syll was shirty-six his sudden.
- Moskowitzer body else if you mean in Africa of
---
I'll be suck around like what?
Yeah you play way, he's just lear the swim?
Pete 


## (b) Calculate perplexitysamples

In [5]:
perplexity_samples = [
    "The student loves homework",
    "The yob loves homework",
    "It is raining in London",
    "asdfjkl; qwerty"
]
for s in perplexity_samples:
    print(f"perplexity of {s:<30}: {lm.perplexity(s, mylm, 4):5.2f}")


perplexity of The student loves homework    :  3.76
perplexity of The yob loves homework        :   inf
perplexity of It is raining in London       :  3.06
perplexity of asdfjkl; qwerty               :   inf


## (c)  Naive  smoothing

In [6]:
for s in perplexity_samples:
    print(f"smoothed perplexity of {s:<30}: {lm.smoothed_perplexity(s, mylm, 4):10.2f}")

smoothed perplexity of The student loves homework    :       3.76
smoothed perplexity of The yob loves homework        :      37.28
smoothed perplexity of It is raining in London       :       3.06
smoothed perplexity of asdfjkl; qwerty               :  106907.17


## (d) Language Identification

In [8]:
from tqdm.notebook import tqdm

languages = ['da', 'de', 'en', 'fr', 'it', 'nl']
lang_files = {lang: f'assets/{lang}.train.txt' for lang in languages}

for order in [0, 2, 4]:
    models = {"char": None, "word": None}
    models["char"] = {lang: lm.train_char_lm(lang_file, order) for lang, lang_file in lang_files.items()}
    models["word"] = {lang: lm.train_word_lm(lang_file, order) for lang, lang_file in lang_files.items()}

    print(f"Order {order}")
    res_lang = {"char": {lang:[] for lang in languages}, "word": {lang:[] for lang in languages}}
    with open('assets/test.txt') as test_file:
        print(f"Testing...")
        for i, line in enumerate(tqdm(test_file, total=1200)):
            language, text = line.split('\t')
            for mode in ["char", "word"]:
                scores = {lang: lm.smoothed_perplexity(text, models[mode][lang], order, mode=mode) for lang in languages}
                predicted_language = min(scores, key=scores.get)
                if i == 0:
                    print("---", "first line", "---")
                    print(f"{mode} model:")
                    for lang in scores:
                        print(f"{lang:<3}-> {scores[lang]:10.2f}")
                    print("------------------")
                # Compare predicted_language with the actual language and count accuracy
                res_lang[mode][language].append(predicted_language == language)

    for mode in ["char", "word"]:
        print(f"Accuracy of {mode} model:")
        for lang in languages:
            print(f"{lang} -> {sum(res_lang[mode][lang]) / len(res_lang[mode][lang]):.2f}")



Order 0
Testing...


  0%|          | 0/1200 [00:00<?, ?it/s]

--- first line ---
char model:
da ->      29.32
de ->      29.52
en ->      31.44
fr ->      21.57
it ->      23.48
nl ->      26.63
------------------
--- first line ---
word model:
da ->  465944.01
de ->  443230.07
en ->  533520.54
fr ->    1237.39
it ->  159425.51
nl ->   95920.13
------------------
Accuracy of char model:
da -> 0.96
de -> 0.93
en -> 0.95
fr -> 0.97
it -> 0.98
nl -> 0.91
Accuracy of word model:
da -> 1.00
de -> 1.00
en -> 1.00
fr -> 1.00
it -> 1.00
nl -> 0.99
Order 2
Testing...


  0%|          | 0/1200 [00:00<?, ?it/s]

--- first line ---
char model:
da ->      51.43
de ->      45.13
en ->      35.40
fr ->       8.01
it ->      34.80
nl ->      19.05
------------------
--- first line ---
word model:
da -> 2610157.22
de -> 2610157.22
en -> 2610157.22
fr ->  183722.52
it -> 2610157.22
nl -> 2610157.22
------------------
Accuracy of char model:
da -> 1.00
de -> 1.00
en -> 1.00
fr -> 1.00
it -> 1.00
nl -> 0.99
Accuracy of word model:
da -> 0.97
de -> 0.90
en -> 0.97
fr -> 0.98
it -> 0.93
nl -> 0.91
Order 4
Testing...


  0%|          | 0/1200 [00:00<?, ?it/s]

--- first line ---
char model:
da ->  129237.70
de ->   46355.61
en ->   15225.68
fr ->       6.25
it ->   16784.92
nl ->    2581.62
------------------
--- first line ---
word model:
da ->  837677.64
de ->  837677.64
en ->  837677.64
fr ->  609733.15
it ->  837677.64
nl ->  837677.64
------------------
Accuracy of char model:
da -> 1.00
de -> 1.00
en -> 1.00
fr -> 1.00
it -> 1.00
nl -> 0.99
Accuracy of word model:
da -> 0.92
de -> 0.77
en -> 0.80
fr -> 0.83
it -> 0.75
nl -> 0.81


### Results:

| Model Type | n-gram Order | da   | de   | en   | fr   | it   | nl   |
|------------|--------------|------|------|------|------|------|------|
| Char       | 0            | 0.96 | 0.93 | 0.95 | 0.97 | 0.98 | 0.91 |
| Char       | 2            | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.99 |
| Char       | 4            | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.99 |
| Word       | 0            | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 0.99 |
| Word       | 2            | 0.97 | 0.90 | 0.97 | 0.98 | 0.93 | 0.91 |
| Word       | 4            | 0.92 | 0.77 | 0.80 | 0.83 | 0.75 | 0.81 |

### Analysis:

1. Order 0 (Unigram): The word model outperforms the character model across all languages, reaching near-perfect accuracy.
2. Order 2 (Trigram): The character model reaches near-perfect accuracy for all languages. The word model's performance drops slightly for some languages, particularly German (de) and Dutch (nl).
3. Order 4 (5-gram): The character model maintains near-perfect accuracy. However, the word model's performance further declines for all languages.

The higher accuracy of character models with increasing n-grams suggests they might be more resilient to data sparsity issues compared to word models at higher n-gram orders. The word model's decreasing accuracy at higher orders indicates challenges with overfitting or handling unseen word combinations.


## (e) Gender Bias

In [12]:
genders_train_file = "assets/tennis.train.txt"
genders_test_file = "assets/tennis.test.txt"
genders = ["M", "F"]
genders_files = {"M": "assets/tennis.train.m.txt", "F": "assets/tennis.train.f.txt"}

# Open the original file for reading
with open(genders_train_file , 'r') as file:
    male_sentences = []
    female_sentences = []

    # Iterate through each line in the file
    for line in file:
        # Split the line into class and sentence based on the tab character
        cls, sentence = line.strip().split('\t')

        # Check the class and append the sentence to the corresponding list
        # after converting it to lowercase
        if cls == 'M':
            male_sentences.append(sentence.lower())
        elif cls == 'F':
            female_sentences.append(sentence.lower())

# Write male sentences to a new file
with open(genders_files["M"], "w") as male_file:
    for sentence in male_sentences:
        male_file.write(sentence + '\n')

# Write female sentences to a new file
with open(genders_files["F"], "w") as female_file:
    for sentence in female_sentences:
        female_file.write(sentence + '\n')


for order in [0, 2, 4]:
    models = {"char": None, "word": None}
    models["char"] = {gender: lm.train_char_lm(gender_file, order) for gender, gender_file in genders_files.items()}
    models["word"] = {gender: lm.train_word_lm(gender_file, order) for gender, gender_file in genders_files.items()}

    print(f"Order {order}")
    res_gender = {"char": {gender:[] for gender in genders}, "word": {gender:[] for gender in genders}}
    with open(genders_test_file) as test_file:
        print(f"Testing...")
        for i, line in enumerate(tqdm(test_file, total=8214)):
            gender, text = line.split('\t')
            for mode in ["char", "word"]:
                scores = {gender: lm.smoothed_perplexity(text, models[mode][gender], order, mode=mode) for gender in genders}
                predicted_gender = min(scores, key=scores.get)
                if i == 0:
                    print("---", "first line", "---")
                    print(f"{mode} model:")
                    for gender in scores:
                        print(f"{gender:<3}-> {scores[gender]:10.2f}")
                    print("------------------")
                # Compare predicted_language with the actual language and count accuracy
                res_gender[mode][gender].append(predicted_gender == gender)

    for mode in ["char", "word"]:
        print(f"Accuracy of {mode} model:")
        for gender in genders:
            print(f"{gender} -> {sum(res_gender[mode][gender]) / len(res_gender[mode][gender]):.2f}")


Order 0
Testing...


  0%|          | 0/8214 [00:00<?, ?it/s]

--- first line ---
char model:
M  ->      25.65
F  ->      25.62
------------------
--- first line ---
word model:
M  ->     955.55
F  ->     940.71
------------------
Accuracy of char model:
M -> 0.45
F -> 0.61
Accuracy of word model:
M -> 0.57
F -> 0.68
Order 2
Testing...


  0%|          | 0/8214 [00:00<?, ?it/s]

--- first line ---
char model:
M  ->      18.30
F  ->      18.00
------------------
--- first line ---
word model:
M  ->   12420.65
F  ->   12979.05
------------------
Accuracy of char model:
M -> 0.58
F -> 0.71
Accuracy of word model:
M -> 0.63
F -> 0.50
Order 4
Testing...


  0%|          | 0/8214 [00:00<?, ?it/s]

--- first line ---
char model:
M  ->      30.97
F  ->      34.85
------------------
--- first line ---
word model:
M  ->  256502.09
F  ->  256502.09
------------------
Accuracy of char model:
M -> 0.64
F -> 0.60
Accuracy of word model:
M -> 0.71
F -> 0.33


### Results

| Model Type | n-gram Order | M    | F    |
|------------|--------------|------|------|
| Char       | 0            | 0.45 | 0.61 |
| Char       | 2            | 0.58 | 0.71 |
| Char       | 4            | 0.64 | 0.60 |
| Word       | 0            | 0.57 | 0.68 |
| Word       | 2            | 0.63 | 0.50 |
| Word       | 4            | 0.71 | 0.33 |

### Analysis

1. N-gram Order 0:
Char model favors 'F' (0.61) over 'M' (0.45).
Word model also leans towards 'F' (0.68) compared to 'M' (0.57).
2. N-gram Order 2:
Char model improves for both genders with 'F' leading (0.71 vs 0.58).
Word model now prefers 'M' (0.63) over 'F' (0.50).
3. N-gram Order 4:
Char model shows 'M' and 'F' near parity (0.64 vs 0.60).
Word model heavily favors 'M' (0.71) with a significant drop for 'F' (0.33).

### Inferences
1. Word model displays varying biases for 'M' and 'F' as n-gram order changes, possibly suggesting overfitting.
2. Char model gives more consistent results between genders but might lack nuances captured by the word model.
3. Higher n-gram orders don't guarantee better accuracy, as seen with 'F' in the word model.


## Appendix

belows are the code that is implemtened in the charlm.py file


```python
def train_word_lm(fname, order=1):
    """Train a word-level language model using n-grams from a given file.

    Args:
    - fname (str): Name of the file containing training data.
    - order (int): The n-gram order for the language model.

    Returns:
    - dict: The trained language model.
    """

    # Open and read the file content
    with open(fname, 'r') as f:
        data = f.read()

    # Tokenize the data into sentences
    sentences = sent_tokenize(data)

    # Tokenize each sentence into words
    sents = [word_tokenize(sentence) for sentence in sentences]

    lm = defaultdict(Counter)
    
    # Use a special padding word "<PAD>" to pad the sentences.
    # This helps in managing the start of sentences.
    pad = ["<PAD>"] * order
    for sent in sents:
        # Add padding to the start and a special "<END>" token to the end of each sentence
        sent = pad + sent + ["<END>"]
        
        # Loop through each word in the sentence and create n-grams
        for i in range(len(sent) - order):
            history, word = tuple(sent[i:i+order]), sent[i+order]
            # Update the counts for this n-gram in the language model
            lm[history][word] += 1

    # Normalize the counts to get probabilities
    outlm = {hist: normalize(chars) for hist, chars in lm.items()}
    
    return outlm


def perplexity(text, lm, order=4, mode="char"):
    """Compute the perplexity of a given text using an input language model (LM).

    Args:
    - text (str): The input text for which perplexity is calculated.
    - lm (dict): The language model.
    - order (int): The n-gram order.
    - mode (str): Mode of operation - either 'char' for character-level or 'word' for word-level.

    Returns:
    - float: The perplexity of the input text.
    """

    # Choose padding based on mode (character or word)
    if mode == "char":
        pad = "~" * order
        data = pad + text
    elif mode == "word":
        pad = ["<PAD>"] * order
        data = pad + word_tokenize(text) + ["<END>"]
    else:
        raise ValueError("Invalid mode. Choose 'char' or 'word'.")

    log_prob = 0
    for i in range(len(data) - order):
        if mode == "char":
            history, char = data[i:i+order], data[i+order]
        elif mode == "word":
            history, char = tuple(data[i:i+order]), data[i+order]
        
        # Check if the character or word is in the LM for the given history
        if char in [ch for ch, _ in lm[history]]:
            prob = dict(lm[history])[char]
            log_prob += log(prob)
        else:
            # Return infinity if probability isn't found in the model
            return float("inf")
            
    return exp(-log_prob / len(data))


def smoothed_perplexity(text, lm, order=4, mode="char"):
    """Compute the smoothed perplexity of a given text using an input LM.

    Args:
    - text (str): The input text for which perplexity is calculated.
    - lm (dict): The language model.
    - order (int): The n-gram order.
    - mode (str): Mode of operation - either 'char' for character-level or 'word' for word-level.

    Returns:
    - float: The smoothed perplexity of the input text.
    """
    
    # Choose padding based on mode (character or word)
    if mode == "char":
        pad = "~" * order
        data = pad + text
    elif mode == "word":
        pad = ["<PAD>"] * order
        data = pad + word_tokenize(text) + ["<END>"]
    else:
        raise ValueError("Invalid mode. Choose 'char' or 'word'.")

    log_prob = 0
    for i in range(len(data) - order):
        if mode == "char":
            history, char = data[i:i+order], data[i+order]
        elif mode == "word":
            history, char = tuple(data[i:i+order]), data[i+order]
        
        try:
            prob = dict(lm[history])[char]
        except KeyError:
            prob = 1.0e-7
        log_prob += log(prob)
            
    return exp(-log_prob / len(data))
```