# Set up

In [None]:
# Fetch the assignment repository, keep only its data/ folder in the working
# directory, then delete the rest of the clone.
!git clone https://github.com/NLP-Reichman/assignment_1.git
!mv assignment_1/data data
!rm assignment_1/ -r

Cloning into 'assignment_1'...
remote: Enumerating objects: 150, done.[K
remote: Counting objects: 100% (44/44), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 150 (delta 35), reused 22 (delta 22), pack-reused 106[K
Receiving objects: 100% (150/150), 6.79 MiB | 14.24 MiB/s, done.
Resolving deltas: 100% (64/64), done.


# Introduction
In this assignment you will be creating tools for learning and testing language models. The corpora that you will be working with are lists of tweets in 8 different languages that use the Latin script. The data is provided either formatted as CSV or as JSON, for your convenience. The end goal is to write a set of tools that can detect the language of a given tweet.
The relevant files are under the data folder:

- en.csv (or the equivalent JSON file)
- es.csv (or the equivalent JSON file)
- fr.csv (or the equivalent JSON file)
- in.csv (or the equivalent JSON file)
- it.csv (or the equivalent JSON file)
- nl.csv (or the equivalent JSON file)
- pt.csv (or the equivalent JSON file)
- tl.csv (or the equivalent JSON file)

In [None]:
import json
from google.colab import files
import pandas as pd

# Implementation

In [None]:
import os
import itertools
from collections import defaultdict
import numpy as np



## Part 1
Implement the function *preprocess* that iterates over all the data files and creates a single vocabulary, containing all the tokens in the data. Our token definition is a single UTF-8 encoded character. So, the vocabulary list is a simple Python list of all the characters that you see at least once in the data.

Note - do NOT lowercase the sentences in this HW.

In [None]:
def preprocess() -> list[str]:
  '''
  Build the shared character vocabulary of all languages.

  Every ``*.csv`` file under ``./data`` is read and each distinct character
  found in its ``tweet_text`` column is collected. Two reserved padding
  tokens are always added: 'ה' (start of tweet) and 'ס' (end of tweet).

  :return: the vocabulary as a list of single characters, sorted so the
           order is identical on every run
  '''
  data_dir = "./data"

  tokens = set()
  # Sorted listing keeps the file processing order stable across platforms
  for filename in sorted(os.listdir(data_dir)):
    if not filename.endswith(".csv"):
      continue

    tweets = pd.read_csv(os.path.join(data_dir, filename), usecols=['tweet_text'], encoding='utf-8')

    # Empty cells are parsed as NaN (a float), which cannot be iterated - skip them
    for tweet in tweets['tweet_text'].dropna():
      tokens.update(str(tweet))

  tokens.add('ה') # start token 'ה' for 'התחלה'
  tokens.add('ס') # end token 'ס' for 'סוף'

  # A set has no defined order; sort to make the vocabulary reproducible
  return sorted(tokens)

## Part 2
Implement the function *lm* that generates a language model from a textual corpus. The function should return a dictionary (representing a model) where the keys are all the relevant *n*-1 sequences, and the values are dictionaries with the *n*_th tokens and their corresponding probabilities to occur. For example, for a trigram model (tokens are characters), it should look something like:

{ "ab":{"c":0.5, "b":0.25, "d":0.25}, "ca":{"a":0.2, "b":0.7, "d":0.1} }

which means for example that after the sequence "ab", there is a 0.5 chance that "c" will appear, 0.25 for "b" to appear and 0.25 for "d" to appear.

Note - You should think how to add the add_one smoothing information to the dictionary and implement it.

In [None]:
def lm(lang: str, n: int, smoothed: bool = False) -> dict[str, dict[str, float]]:
  '''
  Return a character-level n-gram language model for the given language.

  The model is estimated from the ``tweet_text`` column of ``./data/<lang>.csv``.
  Every tweet is padded with n-1 start tokens ('ה') and a single end token ('ס')
  before its n-grams are counted.

  :param lang: the language of the model (two-letter code, used as the file name)
  :param n: the n_gram value (must be >= 1)
  :param smoothed: boolean indicating whether to apply add-one smoothing
  :return: for n > 1, a dictionary where the keys are n-1 grams and the values are
           dictionaries mapping the n'th character to its probability; for n == 1,
           a flat dictionary mapping each character to its probability.
           When smoothed, an extra key 'ל' holds the probability (n == 1) or the
           uniform distribution (n > 1) used for unseen characters / contexts.
  :raises ValueError: if n is smaller than 1
  '''
  if n < 1:
    raise ValueError(f"n must be a positive integer, got {n}")

  # Read the tweets according to the input language
  tweets = pd.read_csv("./data/" + lang + ".csv", usecols=['tweet_text'], encoding='utf-8')

  # The unigram model is flat (char -> count); otherwise context -> (char -> count)
  if n == 1:
    model = defaultdict(float)
  else:
    model = defaultdict(lambda: defaultdict(float))

  total_length = 0

  # Count the n-grams of every tweet; empty cells are parsed as NaN and skipped
  for tweet in tweets['tweet_text'].dropna():
    text = 'ה' * (n - 1) + str(tweet) + 'ס'

    if n == 1:
      for char in text:
        model[char] += 1
      total_length += len(text)
    else:
      for i in range(len(text) - n + 1):
        seq = text[i:i+n-1]
        next_char = text[i+n-1]
        model[seq][next_char] += 1

  if smoothed:
    # The shared vocabulary is needed only for smoothing, and building it reads
    # every data file, so it is not computed for unsmoothed models
    vocab = preprocess()
    vocab_size = len(vocab)

    if n == 1:
      # One extra slot in the denominator is reserved for the unknown character
      denominator = total_length + vocab_size + 1
      for char in vocab:
        model[char] = (model[char] + 1) / denominator

      # Probability mass of the unknown character
      model['ל'] = 1 / denominator

    else:
      # Replace the sparse counts of every context by a dense add-one distribution
      for seq, counts in list(model.items()):
        total_count = sum(counts.values())
        model[seq] = {char: (counts.get(char, 0) + 1) / (total_count + vocab_size) for char in vocab}

      # Unseen contexts fall back to a uniform distribution over the vocabulary
      model['ל'] = {char: 1 / vocab_size for char in vocab}

  elif n == 1:
    # Maximum-likelihood estimate: relative frequency of every observed character
    for char in model:
      model[char] /= total_length

  else:
    # Maximum-likelihood estimate: normalise the counts of every context.
    # Only observed continuations are stored (no explicit zeros).
    for next_chars in model.values():
      seq_count = sum(next_chars.values())
      for char in next_chars:
        next_chars[char] /= seq_count

  return dict(model)

## Part 3
Implement the function *eval* that returns the perplexity of a model (dictionary) running over the data file of the given target language.

In [None]:
def eval(model: dict, target_lang: str, n: int) -> float:
  '''
  Return the perplexity value calculated over applying the model on the text file
  of the target_lang language.

  The perplexity is computed per tweet (2 to the power of the mean negative log2
  probability of its n-grams) and then averaged over all tweets. Tweets that
  contain no n-gram at all (shorter than n characters, or missing) are skipped
  so that they cannot turn the whole average into NaN.

  Note: this function intentionally keeps the name required by the assignment,
  which shadows the Python builtin ``eval`` in the notebook namespace.

  :param model: the language model (expected to contain the unknown key 'ל',
                i.e. a model built with smoothing)
  :param target_lang: the target language
  :param n: The n-gram of the model
  :return: the perplexity value, or NaN when no tweet could be scored
  '''
  unknown_key = 'ל'

  # Read the tweets
  tweets = pd.read_csv("./data/" + target_lang + ".csv", usecols=['tweet_text'], encoding='utf-8')

  perplexities = []

  # Iterate over all the tweets; empty cells are parsed as NaN and skipped
  for text in tweets['tweet_text'].dropna():

    if n == 1:
      # Characters missing from the model get the unknown-character probability.
      # Membership is tested on the dict itself (constant time per lookup).
      tweet_probs = [np.log2(model[char] if char in model else model[unknown_key]) for char in text]

    else:
      tweet_probs = []
      for i in range(len(text) - n + 1):
        seq = text[i:i+n-1]
        next_char = text[i+n-1]
        # Contexts missing from the model use the unknown-context distribution
        distribution = model[seq] if seq in model else model[unknown_key]
        tweet_probs.append(np.log2(distribution[next_char]))

    # A tweet shorter than n yields no n-grams; the mean of nothing is NaN
    if not tweet_probs:
      continue

    # Entropy of the tweet, then its perplexity
    tweet_entropy = - np.mean(tweet_probs)
    perplexities.append(2 ** tweet_entropy)

  if not perplexities:
    return float('nan')

  return float(np.mean(perplexities))


## Part 4
Implement the *match* function that calls *eval* using a specific value of *n* for every possible language pair among the languages we have data for. You should call *eval* for every language pair four times, with each call assigning a different value for *n* (1-4). Each language pair is composed of the source language and the target language. Before you make the call, you need to call the *lm* function to create the language model for the source language. Then you can call *eval* with the language model and the target language. The function should return a pandas DataFrame with the following four columns: *source_lang*, *target_lang*, *n*, *perplexity*. The values for the first two columns are the two-letter language codes. The value for *n* is the *n* you use for generating the specific perplexity values, which you should store in the fourth column.

In [None]:
def match() -> pd.DataFrame:
  '''
  Score every (source, target) language pair for n = 1..4.

  For each n and each source language a smoothed model is built once with *lm*
  and then evaluated with *eval* against every target language (the source
  itself included). Each evaluation is also printed as it completes.

  :return: a DataFrame with the columns source, target, n, perplexity -
           one row per language pair and n
  '''
  languages = ['en', 'fr', 'pt', 'es', 'tl', 'nl', 'in', 'it']
  orders = [1, 2, 3, 4]

  rows = []

  # Outer product: all source languages for n=1, then for n=2, and so on
  for order, src in itertools.product(orders, languages):
    # One model per (n, source), reused for all targets
    source_model = lm(src, order, smoothed=True)

    for tgt in languages:
      score = eval(source_model, tgt, order)
      rows.append({'source': src, 'target': tgt, 'n': order, 'perplexity': score})
      print(f'source: {src}, target: {tgt}, n: {order}, perplexity: {score}')

  return pd.DataFrame(rows, columns=['source', 'target', 'n', 'perplexity'])

## Part 5
Implement the *generate* function which takes a language code, *n*, the prompt (the starting text), the number of tokens to generate, and *r*, which is the random seed for any randomized action you plan to take in your implementation. The function should start generating tokens, one by one, using the language model of the given source language and *n*. The prompt should be used as a starting point for aligning on the probabilities to be used for generating the next token.

Note - The generation of the next token should be from the LM's distribution.

In [None]:
def generate(lang: str, n: int, prompt: str, number_of_tokens: int, r: int) -> str:
  '''
  Generate text in the given language using the given parameters.

  Tokens are sampled one by one from the unsmoothed n-gram model of *lang*,
  conditioned on the last n-1 characters of the text produced so far.

  :param lang: the language of the model
  :param n: the n_gram value (must be >= 1)
  :param prompt: the prompt to start the generation
  :param number_of_tokens: the number of tokens to generate
  :param r: the random seed to use
  :return: the prompt followed by the generated characters
  :raises ValueError: if n is smaller than 1
  '''
  if n < 1:
    raise ValueError(f"n must be a positive integer, got {n}")

  # set the random seed
  np.random.seed(r)
  # create the language model
  model = lm(lang, n, smoothed=False)
  # generate the text
  text = prompt

  # For the unigrams case (different type of model: char -> probability)
  if n == 1:
    chars = list(model)
    probs = list(model.values())
    for _ in range(number_of_tokens):
      text += str(np.random.choice(chars, p=probs))
    return text

  context_len = n - 1
  # Built lazily: only needed if an unseen context is ever reached
  fallback_chars = None

  for _ in range(number_of_tokens):
    # A text shorter than n-1 characters is left-padded with start tokens,
    # exactly like the tweets the model was trained on
    context = ('ה' * context_len + text)[-context_len:]
    distribution = model.get(context)

    if distribution:
      # Generate character according to the n-1 gram probabilities
      token = np.random.choice(list(distribution), p=list(distribution.values()))
    else:
      # The unsmoothed model has no unknown-context entry, so an unseen context
      # (e.g. right after an end token) falls back to a uniform choice over
      # every character the model has seen as a continuation
      if fallback_chars is None:
        fallback_chars = sorted({char for dist in model.values() for char in dist})
      if not fallback_chars:
        # Empty model: nothing can be generated
        break
      print (f'{context} not in model ')
      token = np.random.choice(fallback_chars)

    # add the token to the text
    text += str(token)

  return text

## Part 6
Play with your generate function: try to generate different texts in different languages and with various values of *n*. No need to submit any of that.

In [None]:
# Sample generations: (language, n, prompt); each produces 20 tokens with seed 5
demo_cases = [
  ('en', 1, "I"),
  ('en', 2, "I am"),
  ('en', 3, "I am"),
  ('en', 4, "I Love"),
  ('es', 2, "Soy"),
  ('es', 3, "Soy"),
  ('fr', 2, "Je suis"),
  ('fr', 3, "Je suis"),
]
for demo_lang, demo_n, demo_prompt in demo_cases:
  print(generate(demo_lang, demo_n, demo_prompt, 20, 5))

# Testing

Copy the content of the **tests.py** file from the repo and paste below. This will create the results.json file and download it to your machine.

In [None]:
# Create tests
# Each test_* function returns a dict of named results; the runner below
# collects them into results.json.
def test_preprocess():
    """Report the size of the shared vocabulary returned by preprocess()."""
    return {
        'vocab_length': len(preprocess()),
    }

def test_lm():
    """Report the number of top-level keys of several smoothed models."""
    return {
        'english_2_gram_length': len(lm('en', 2, True)),
        'english_3_gram_length': len(lm('en', 3, True)),
        'french_3_gram_length': len(lm('fr', 3, True)),
        'spanish_3_gram_length': len(lm('es', 3, True)),
    }

def test_eval():
    """Report the perplexity of the smoothed English trigram model on four target languages."""
    return {
        'en_en': eval(lm('en', 3, True), 'en', 3),
        'en_fr': eval(lm('en', 3, True), 'fr', 3),
        'en_tl': eval(lm('en', 3, True), 'tl', 3),
        'en_nl': eval(lm('en', 3, True), 'nl', 3),
    }

def test_match():
    """Run match() and report three of its n=3 rows with English as the source."""
    df = match()
    return {
        'en_en_3': df[(df['source'] == 'en') & (df['target'] == 'en') & (df['n'] == 3)]['perplexity'].values[0],
        'en_tl_3': df[(df['source'] == 'en') & (df['target'] == 'tl') & (df['n'] == 3)]['perplexity'].values[0],
        'en_nl_3': df[(df['source'] == 'en') & (df['target'] == 'nl') & (df['n'] == 3)]['perplexity'].values[0],
    }

def test_generate():
    """Report the text generated for a fixed set of languages, n values and prompts (20 tokens, seed 5)."""
    return {
        'english_1_gram': generate('en', 1, "I", 20, 5),
        'english_2_gram': generate('en', 2, "I am", 20, 5),
        'english_3_gram': generate('en', 3, "I am", 20, 5),
        'english_4_gram': generate('en', 4, "I Love", 20, 5),
        'spanish_2_gram': generate('es', 2, "Soy", 20, 5),
        'spanish_3_gram': generate('es', 3, "Soy", 20, 5),
        'french_2_gram': generate('fr', 2, "Je suis", 20, 5),
        'french_3_gram': generate('fr', 3, "Je suis", 20, 5),
    }

TESTS = [test_preprocess, test_lm, test_eval, test_match, test_generate]

# Run tests and save results
# A test that raises does not stop the run: the repr of its exception is
# stored under the test's name instead of its results.
res = {}
for test in TESTS:
    try:
        cur_res = test()
        res.update({test.__name__: cur_res})
    except Exception as e:
        res.update({test.__name__: repr(e)})

with open('results.json', 'w') as f:
    json.dump(res, f, indent=2)

# Download the results.json file
# `files` is the google.colab helper imported at the top of the notebook
files.download('results.json')

In [None]:
# Show the local files, results.json should be there now and
# also downloaded to your local machine
!ls -l