## A simple language models demo

Using the lyrics of Taylor Swift

https://github.com/irenetrampoline/taylor-swift-lyrics/blob/master/all_tswift_lyrics.txt

NLTK functions used in this notebook
* `nltk.bigrams`, `nltk.bigrams`
* Also see: https://www.nltk.org/api/nltk.lm.html#module-nltk.lm

Updated (added after class): This notebook uses tuples as keys to the bigram counter object, note in the HW you are asked to build a two-level dictionary which will be more efficient. 

In [1]:
import nltk
import pandas as pd
import numpy as np

from csv import QUOTE_NONE
from collections import Counter
import re

from nltk import word_tokenize

# to avoid confusion with variabls named 'bigrams','trigrams'
from  nltk import bigrams as nltk_bigrams 
from nltk import trigrams as nltk_trigrams

# alternative: for adding <s>, </s> ourselves
#from nltk.lm.preprocessing import pad_both_ends 

from random import choices, seed, choice
seed(0)  # for reproducible results

In [2]:
# placeholder tokens for start and end of sentence/song-line
START_TOK = "<s>"
END_TOK = "</s>"

In [3]:
# read and clean song lines

with open("./all_tswift_lyrics.txt", "r") as f:
    # remove [.*] e.g. "[Chorus]"
    song_lines = [re.sub(r'\[.*\]','',line).lower().strip() for line in f.readlines()]

    # remove empty lines after cleaning
    song_lines = [line for line in song_lines if line != ""]


print("# lines:",len(song_lines))
song_lines[:10]

# lines: 7645


['he said the way my blue eyes shined',
 'put those georgia stars to shame that night',
 'i said, "that\'s a lie."',
 'just a boy in a chevy truck',
 "that had a tendency of gettin' stuck",
 'on back roads at night',
 'and i was right there beside him all summer long',
 'and then the time we woke up to find that summer gone',
 'but when you think tim mcgraw',
 'i hope you think my favorite song']

In [4]:
# for demo - removed punctuation with  tok.isalpha()

tokenized_texts = [[START_TOK]+[tok for tok in word_tokenize(line) if tok.isalpha()] +[END_TOK] 
                   for line in song_lines]

# alternative: use NLTP 'pad_both_ends` to add start/end symbols 
# list(pad_both_ends(['he', 'said', 'the', 'way', 'my', 'blue', 'eyes', 'shined',],n=2))

print(tokenized_texts[0])

['<s>', 'he', 'said', 'the', 'way', 'my', 'blue', 'eyes', 'shined', '</s>']


In [5]:
# check - do all sentences start with '<s>'?
set([toks[0] for toks in tokenized_texts])

{'<s>'}

In [6]:
# Demo: nltk 'bigrams' function - returns a generator, use 'list()' to see values
list(nltk_bigrams(tokenized_texts[0]))

[('<s>', 'he'),
 ('he', 'said'),
 ('said', 'the'),
 ('the', 'way'),
 ('way', 'my'),
 ('my', 'blue'),
 ('blue', 'eyes'),
 ('eyes', 'shined'),
 ('shined', '</s>')]

In [7]:
# Builing counters from lists of sequences

bigrams_by_text = [nltk_bigrams(token_arr) for token_arr in tokenized_texts]
trigrams_by_text = [nltk_trigrams(token_arr) for token_arr in tokenized_texts]

raw_bigram_counts = Counter([bigram for arr in bigrams_by_text for bigram in arr])
raw_trigram_counts = Counter([trigram for arr in trigrams_by_text for trigram in arr])

In [8]:
# Python note - tuples are hashable - can be used as keys for dictionaries and counters

some_bigram = ('<s>','he')
print(f"# of occurrences of {some_bigram} :",raw_bigram_counts[some_bigram])

some_trigram = ('<s>','he','said')
print(f"# of occurrences of {some_trigram}:",raw_trigram_counts[some_trigram])

# of occurrences of ('<s>', 'he') : 88
# of occurrences of ('<s>', 'he', 'said'): 6


In [9]:
# if can encode unigrams as tuple of length 1 if we need, e.g ('the',)

raw_unigram_counts = Counter([(tok,) for arr in tokenized_texts for tok in arr])

some_unigram_tuple = ('the',)   # notice comma
raw_unigram_counts[some_unigram_tuple]

1665

Note: the rest of this notebook should not be taken as a demo for HW

In [10]:
def get_effective_counts(raw_ngram_counts, alpha=0.5):
    """get effective counts based with some smoothing factor"""
    vocab = set(raw_ngram_counts.keys())
    v = len(vocab)
    m = raw_ngram_counts.total()
    effective_counts = {tok:((raw_ngram_counts[tok]+alpha)*(m/(m+v))) for tok in vocab}
    return effective_counts

unigram_eff_counts = get_effective_counts(raw_unigram_counts)
bigram_eff_counts = get_effective_counts(raw_bigram_counts)
trigram_eff_counts = get_effective_counts(raw_trigram_counts)

most_freq_trigrams =  sorted(trigram_eff_counts,  key=trigram_eff_counts.get, reverse=True)[:20]


In [11]:
for triple in most_freq_trigrams:
    printable_trigram = "_".join(triple)
    unsmooth_count = raw_trigram_counts[triple]
    smooth_count = trigram_eff_counts[triple]
    print(f"{printable_trigram:15} {unsmooth_count:4} {smooth_count:7.2f} ")

<s>_and_i        262  173.21 
<s>_but_i        101   66.97 
<s>_and_you       75   49.82 
la_la_la          56   37.28 
to_me_</s>        55   36.62 
i_i_i             53   35.30 
<s>_i_do          48   32.00 
<s>_do_you        47   31.34 
the_way_you       47   31.34 
<s>_and_it        44   29.36 
<s>_oh_oh         43   28.70 
shake_it_off      43   28.70 
oh_oh_</s>        41   27.38 
<s>_if_you        41   27.38 
<s>_i_was         41   27.38 
<s>_i_know        40   26.72 
i_shake_it        40   26.72 
i_do_know         40   26.72 
<s>_but_you       39   26.06 
now_we_got        38   25.40 


## Song line generation using Bigrams



In [12]:
current_token = START_TOK

# Step 1 - what are bigrams to start a sentence? 
start_bigram_counts = {bigram:count for bigram, count in bigram_eff_counts.items() if bigram[0] == START_TOK}
print("# unique bigrams:",len(start_bigram_counts))

print("Most likely words to start a song line")
most_likely_start_bigrams = sorted(start_bigram_counts.items(), key=lambda tup:tup[1], reverse=True)[:10]

for bigram, count in most_likely_start_bigrams:
    print(f"{str(bigram):15} \t {count:.2f}")

# unique bigrams: 670
Most likely words to start a song line
('<s>', 'i')    	 768.46
('<s>', 'and')  	 721.43
('<s>', 'you')  	 385.21
('<s>', 'but')  	 254.32
('<s>', 'it')   	 171.25
('<s>', 'oh')   	 132.06
('<s>', 'the')  	 124.22
('<s>', 'so')   	 123.44
('<s>', 'we')   	 105.41
('<s>', 'that') 	 77.98


In [13]:
# Step 2 - select w_1 at random

print('Generating 20 sample first-words for a new song line:')
for _ in range(20):

    # get list of all bigrams of form ('_START', <word> ) 
    possible_start_bigrams = list(start_bigram_counts.keys())

    # randomly select bigram, weighted by effective count
    bigram_weights = list(start_bigram_counts.values())  # weight based on count
    sample_initial_bigram = choices( possible_start_bigrams, weights=bigram_weights,)[0]
    
    print(sample_initial_bigram)

Generating 20 sample first-words for a new song line:
('<s>', 'i')
('<s>', 'and')
('<s>', 'are')
('<s>', 'my')
('<s>', 'we')
('<s>', 'somehow')
('<s>', 'did')
('<s>', 'do')
('<s>', 'praying')
('<s>', 'tell')
('<s>', 'i')
('<s>', 'the')
('<s>', 'drive')
('<s>', 'and')
('<s>', 'saying')
('<s>', 'in')
('<s>', 'i')
('<s>', 'ghosts')
('<s>', 'lantern')
('<s>', 'i')


In [14]:
# putting it together

def generate_line_from_bigrams(bigram_eff_counts:dict, max_len:int=100)->str:
    output_tokens = []
    current_token = START_TOK
    
    for _ in range(max_len+1):
        
        output_tokens.append(current_token)
        if current_token=='</s>':
            break

        # is this efficient??
        possible_bigram_data = {bigram:count for bigram, count in bigram_eff_counts.items() if bigram[0] == current_token}
        
        next_bigram = choices( list(possible_bigram_data.keys()), weights=list(possible_bigram_data.values()),)[0]
        
        current_token = next_bigram[1]
    return " ".join(output_tokens)
        

# generate 20 lines
for _ in range(20):
    print(generate_line_from_bigrams(bigram_eff_counts)+'\n')

<s> do what past gon na sail the rest in silent night holy night with no more i need me strike a nasty scar </s>

<s> well i was stay stay through again </s>

<s> you now </s>

<s> i met you are freethe way too soon enough </s>

<s> there </s>

<s> ca take it all the dead of this is a ticket on me somewhere </s>

<s> new yorknice to your eyes </s>

<s> i learned </s>

<s> but none of thirst </s>

<s> when i swear i play guitar </s>

<s> and you i i down </s>

<s> you walked in the upper hand </s>

<s> and somebody tells me that i go free </s>

<s> so where they do know is enough for you </s>

<s> my life lessons are is good girl faith </s>

<s> it feels to do </s>

<s> last eight times before your eyes off the air </s>

<s> knowing it hurts or break a movie you wonder about it a bit of stars </s>

<s> i never worse or another </s>

<s> take our whole thing to listen </s>



## Trying it with trigrams

In [15]:
def generate_line_from_trigrams(first_word, trigram_eff_counts, max_len=100):
    current_bigram = (START_TOK,first_word)
    
    output_tokens = [START_TOK]
    
    for _ in range(max_len):
        output_tokens.append(current_bigram[-1])
        
        #output_tokens.append(current_bigram[1])
        if  current_bigram[-1]==END_TOK:
            break
            
        # Step 1 - what are bigrams to start a sentence? 
        possible_trigram_data = {trigram:count for trigram, count in trigram_eff_counts.items() if trigram[:2] == current_bigram}
        next_trigram = choices( list(possible_trigram_data.keys()), weights=list(possible_trigram_data.values()),)[0]
        current_bigram = (next_trigram[1],next_trigram[2])
    
    return " ".join(output_tokens)

# note that the NLTK library gives us more elegant ways to handle trigrams, 4-grams, etc

In [16]:
# generate 20 lines
for _ in range(20):
    print(generate_line_from_trigrams("love", trigram_eff_counts)+'\n')

<s> love is a tough crowd </s>

<s> love like a baby </s>

<s> love is ours </s>

<s> love is different </s>

<s> love like a dream </s>

<s> love got me cornered in the rain outside i came out alive </s>

<s> love got me alone </s>

<s> love is ours </s>

<s> love got me feeling like i always get it </s>

<s> love got me feeling like i do wan na know you better now </s>

<s> love a fragile line </s>

<s> love got me tangled tired me too </s>

<s> love pure light </s>

<s> love is alive back from that soul vacation </s>

<s> love pure light </s>

<s> love a fragile line </s>

<s> love is alive back from your knife </s>

<s> love like this </s>

<s> love is glowing in the morning of your past thinking your future was me </s>

<s> love got me a light go on </s>



#### Possible improvments
* Use NLTL language model libraries
* Other smoothing methods 