<a href="https://colab.research.google.com/github/Sitraka17/Learning-Data-Science/blob/main/ACL2023_Tutorial_how_to_evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to evaluate text generated by a language model

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q git+https://github.com/huggingface/evaluate.git
!pip install -q torch
!pip install -q mauve-text

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.6 MB/s[

### Declare Constants

In [None]:
# Declare constants.

# Identifiers of the model variants to evaluate; each identifier is also the
# file stem of that model's data file.
models = [
    'large-762M-p0.95', 'large-762M',
    'medium-345M-p0.9', 'medium-345M',
    'small-117M-p0.9', 'small-117M',
    'xl-1542M-p0.95', 'xl-1542M',
]

# All data files share this URL prefix.
_webtext_root = 'https://raw.githubusercontent.com/rycolab/clusters-in-language-evaluation/main/data/raw/webtext'

# URL of the human-written reference texts.
human_data_url = _webtext_root + '/human.test.jsonl'

# model_data_url[model][seed] -> URL of that model's file, for seeds 0 through 4.
model_data_url = {
    name: {idx: f'{_webtext_root}/seed_{idx}/{name}.test.jsonl' for idx in range(5)}
    for name in models
}

# Maximum number of texts that read_data keeps per file.
n_lines = 200

# Reference scores, written one row per entry and transposed below into
# webtext_scores[criterion][entry].
# NOTE(review): presumably human-judgment scores from the source study; confirm.
_criteria = ('sensible', 'interesting', 'human-like')
_score_rows = {
    #                    sensible  interesting  human-like
    'small-117M-p0.9':  (-7.442,   -0.697,      -15.783),
    'small-117M':       (-37.805,  -15.487,     -27.518),
    'medium-345M-p0.9': (-7.293,   -12.824,     -3.429),
    'medium-345M':      (-32.004,  -34.323,     -30.769),
    'large-762M-p0.95': (8.781,    6.785,       12.553),
    'large-762M':       (-7.106,   -1.532,      -6.935),
    'xl-1542M-p0.95':   (31.888,   23.046,      15.664),
    'xl-1542M':         (7.753,    9.529,       8.966),
    'human':            (43.229,   25.503,      47.251),
}
webtext_scores = {
    criterion: {entry: row[col] for entry, row in _score_rows.items()}
    for col, criterion in enumerate(_criteria)
}

# Drop the scaffolding so only the constants remain in the namespace.
del _webtext_root, _criteria, _score_rows

### Read data

Here, we assume we have a set of models from which we previously generated text.

In [None]:
import json
import urllib.request  # the lib that handles the url stuff


def read_data(fpath, max_lines=None):
    """Read the 'text' field of the first records of a JSON-lines resource.

    Args:
        fpath: URL passed to ``urllib.request.urlopen``; each non-blank line
            must be a JSON object with a ``'text'`` key.
        max_lines: Maximum number of texts to return. When ``None`` (the
            default) the module-level ``n_lines`` is used, looked up at call
            time.

    Returns:
        A list with at most ``max_lines`` texts, in file order.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``'text'`` key.
    """
    limit = n_lines if max_lines is None else max_lines
    texts = []
    # The context manager closes the response even when we stop reading early
    # or a line fails to parse.
    with urllib.request.urlopen(fpath) as response:
        for line in response:
            if len(texts) >= limit:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            texts.append(json.loads(line)['text'])
    return texts


In [None]:
# Human-written reference texts.
texts_human = read_data(human_data_url)

# For every model, use only the file stored under seed index 0.
first_seed_urls = {name: seed_urls[0] for name, seed_urls in model_data_url.items()}
texts_models = {name: read_data(url) for name, url in first_seed_urls.items()}

### Evaluate Models with Mauve

In [None]:
from evaluate import load
from tqdm.notebook import tqdm
# Load the MAUVE metric through the `evaluate` library.
mauve = load('mauve')

# Keyword arguments shared by every compute call.
mauve_options = dict(featurize_model_name='gpt2', device_id=0, verbose=False)

# Start from an empty column so that re-running this cell replaces earlier scores;
# both names refer to the same dict.
mauve_by_model = webtext_scores['mauve'] = {}
for model, predictions in tqdm(texts_models.items(), desc='Iterating through models'):
  # Compare this model's texts against the human reference texts.
  outcome = mauve.compute(predictions=predictions, references=texts_human, **mauve_options)
  mauve_by_model[model] = outcome.mauve

Downloading builder script:   0%|          | 0.00/6.63k [00:00<?, ?B/s]

Iterating through models:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing p:   0%|          | 0/200 [00:00<?, ?it/s]

Featurizing q:   0%|          | 0/200 [00:00<?, ?it/s]

### Show Results

In [None]:
import pandas as pd

# Outer keys of webtext_scores become columns, inner keys become the row index.
df = pd.DataFrame.from_dict(webtext_scores, orient='columns')
df

Unnamed: 0,sensible,interesting,human-like,mauve
small-117M-p0.9,-7.442,-0.697,-15.783,0.972505
small-117M,-37.805,-15.487,-27.518,0.914673
medium-345M-p0.9,-7.293,-12.824,-3.429,0.93765
medium-345M,-32.004,-34.323,-30.769,0.698784
large-762M-p0.95,8.781,6.785,12.553,0.944079
large-762M,-7.106,-1.532,-6.935,0.9594
xl-1542M-p0.95,31.888,23.046,15.664,0.943348
xl-1542M,7.753,9.529,8.966,0.950983
human,43.229,25.503,47.251,


In [None]:
# Pearson correlation between the score columns, with the 'human' row excluded.
df.drop(index='human').corr(method='pearson')

Unnamed: 0,sensible,interesting,human-like,mauve
sensible,1.0,0.911209,0.943513,0.533497
interesting,0.911209,1.0,0.878977,0.749329
human-like,0.943513,0.878977,1.0,0.586823
mauve,0.533497,0.749329,0.586823,1.0
