## 0. Installments and Downloads


In [0]:
# download the data from google drive and unzip 
!gdown https://drive.google.com/uc?id=1OYcrdW8VdQrNQ-P0d8UotHx5QmY7TQaY
!unzip 'previous material.zip'

Downloading...
From: https://drive.google.com/uc?id=1OYcrdW8VdQrNQ-P0d8UotHx5QmY7TQaY
To: /content/previous material.zip
73.1MB [00:00, 102MB/s] 
Archive:  previous material.zip
   creating: data/
 extracting: Taylor_lyrics.ipynb     
 extracting: data/songdata.csv       
 extracting: data/tweet_sample.csv   
 extracting: data/Taylor_lyrics.ipynb  
 extracting: data/taylor_swift_lyrics.csv  
 extracting: data/processed_pop_sample.csv  
 extracting: data/taylor_swift_lyrics_sample.csv  


## 1. Data Loading and pre-processing

In [0]:
import numpy as np
import pandas as pd
import codecs
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import ConditionalFreqDist
from nltk.util import ngrams
from scipy import stats

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# import the dataset of taylor swift lyrics
dataset = pd.read_csv('data/taylor_swift_lyrics.csv', encoding='latin1')
dataset.head()

Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,Taylor Swift,Taylor Swift,Tim McGraw,1,He said the way my blue eyes shined,1,2006
1,Taylor Swift,Taylor Swift,Tim McGraw,1,Put those Georgia stars to shame that night,2,2006
2,Taylor Swift,Taylor Swift,Tim McGraw,1,"I said, ""That's a lie""",3,2006
3,Taylor Swift,Taylor Swift,Tim McGraw,1,Just a boy in a Chevy truck,4,2006
4,Taylor Swift,Taylor Swift,Tim McGraw,1,That had a tendency of gettin' stuck,5,2006


In [0]:
# sort out all songs & merge lyrics from the same song
merged_dataset = dataset.groupby(by=['track_title'])['lyric'].agg(lambda x: '. '.join(x))
merged_dataset = pd.DataFrame(merged_dataset).reset_index()
merged_dataset.head(10)

Unnamed: 0,track_title,lyric
0,...Ready for It?,Knew he was a killer first time that I saw him...
1,22,It feels like a perfect night to dress up like...
2,A Perfectly Good Heart,Why would you wanna break a perfectly good hea...
3,A Place In This World,"I don't know what I want, so don't ask me. Cau..."
4,All Too Well,"I walked through the door with you, the air wa..."
5,All You Had to Do Was Stay,People like you always want back. The love the...
6,Back To December,I'm so glad you made time to see me. How's lif...
7,Bad Blood,"'Cause baby, now we've got bad blood. You know..."
8,Begin Again,Took a deep breath in the mirror. He didn't li...
9,Better Than Revenge,Now go stand in the corner and think about wha...


## N-gram Model

In [0]:
sentences = [sent+'.' for sent in dataset['lyric'].values]

In [0]:
# dealing with format, such as punctuation
def deal_with_punct(words):
  punctuations = {'.': '. ', ',': ', ',';': '; ',':': ': ','...': '... ',
           '?': '? ', '!': '! ','(': ' (', ')': ') ', '[': ' [',
           ']':  '] ', '``':' "', "''": '" '}
  ans = ' '
  for word in words:
    if word in punctuations:
      punct = punctuations[word]
      if ans[-1]==' ':
        ans = ans[:-1] + punct
      else:
        ans += punct
    else:
      ans += word + ' '
  return ans[:-1]

In [0]:
# next word prediction based on frequencies
def nextword(ngram, cfdist=None):
  xk = np.arange(cfdist[ngram].B())
  pk = []
  candidate = []
  for next_word in cfdist[ngram]:
    candidate.append(next_word)
    pk.append(cfdist[ngram].freq(next_word))

  custm = stats.rv_discrete(values=(xk,pk))
  return candidate[custm.rvs()]

In [0]:
# generating lyrics
def generate_lyrics(sentences, n, num_sents):
  tok_sents = [[None for _ in range(n-1)] + word_tokenize(sent) for sent in sentences]
  words = []
  for tok_sent in tok_sents:
    words += tok_sent
  words += [None]
  ngrams_ = ngrams(words, n)
  ngrams_cfd = ConditionalFreqDist((ngram[:-1], ngram[-1]) for ngram in ngrams_)

  text_input = tuple(None for _ in range(n-1))
  text_gen = []
  num_sents_gen = 0
  while True:
    next_word = nextword(text_input, ngrams_cfd)
    if not next_word:
      num_sents_gen += 1
      if num_sents and num_sents==num_sents_gen:
        break
      text_input = tuple(None for _ in range(n-1))
      continue

    text_input += (next_word,) 
    text_input = text_input[1:]
    text_gen.append(next_word)
  return deal_with_punct(text_gen)

In [0]:
size = 15
outputs = []
for i in range(size):
  outputs.append(generate_lyrics(sentences, 4, num_sents=1))

outputs

[" We found Wonderland, you and I swear I do n't wan na touch you, I run and run.",
 " 'Cause it 's late and your mama do n't know how my friends could be so mean.",
 " And I would 've been so happy.",
 ' Had me in the eye and told me you loved me.',
 ' I just want to know you better now.',
 ' Dating the boy on the football team.',
 ' Cause we never go out of style.',
 " 'Cause all I know is.",
 ' Are you ready for it?.',
 ' On all my wasted time.',
 ' And now you say\x97.',
 ' And everybody knows that.',
 " My baby 's fit like a daydream.",
 ' I am not the kind of girl.',
 " And I do n't know what to say."]

## Finetune-GPT2 Model

Here we basically used pre-trained GPT-2 model to transfer learning on our task. The model is simply for comparing results.

In [0]:
!gdown https://drive.google.com/uc?id=188gwThRiEuAXNIkbaKLjOktJwZvEnIt5
!unzip new_finetune.zip

!gdown https://drive.google.com/uc?id=1EPzPWHAsuAfoRpsA3d4qA-ogJyLf-7G0  

Downloading...
From: https://drive.google.com/uc?id=188gwThRiEuAXNIkbaKLjOktJwZvEnIt5
To: /content/new_finetune.zip
463MB [00:03, 118MB/s]
Archive:  new_finetune.zip
   creating: new_finetune/
  inflating: new_finetune/tokenizer_config.json  
  inflating: new_finetune/special_tokens_map.json  
  inflating: new_finetune/config.json  
  inflating: new_finetune/merges.txt  
  inflating: __MACOSX/new_finetune/._merges.txt  
  inflating: new_finetune/training_args.bin  
  inflating: new_finetune/pytorch_model.bin  
  inflating: __MACOSX/new_finetune/._pytorch_model.bin  
  inflating: new_finetune/vocab.json  
  inflating: new_finetune/eval_results.txt  
  inflating: __MACOSX/new_finetune/._eval_results.txt  
Downloading...
From: https://drive.google.com/uc?id=1EPzPWHAsuAfoRpsA3d4qA-ogJyLf-7G0
To: /content/run_generation.py
100% 10.1k/10.1k [00:00<00:00, 25.1MB/s]


In [0]:
!python -m pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 9.6MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 19.8MB/s 
[?25hCollecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |█████

In [0]:
# change input sentence here to generate different output 
sentence = "She's not a saint and she's not what you think She's an actress, whoa She's better known for the things that she does On the mattress, whoa"

change the value of "length" will change the length of output

In [0]:
!python3 run_generation.py \
    --model_type=gpt2 \
    --length=25 \
    --model_name_or_path='./new_finetune' \
    --prompt="$sentence"

2020-05-07 02:55:46.232416: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
05/07/2020 02:55:48 - INFO - transformers.tokenization_utils -   Model name './new_finetune' not found in model shortcut name list (gpt2, gpt2-medium, gpt2-large, gpt2-xl, distilgpt2). Assuming './new_finetune' is a path, a model identifier, or url to a directory containing tokenizer files.
05/07/2020 02:55:48 - INFO - transformers.tokenization_utils -   Didn't find file ./new_finetune/added_tokens.json. We won't load it.
05/07/2020 02:55:48 - INFO - transformers.tokenization_utils -   loading file ./new_finetune/vocab.json
05/07/2020 02:55:48 - INFO - transformers.tokenization_utils -   loading file ./new_finetune/merges.txt
05/07/2020 02:55:48 - INFO - transformers.tokenization_utils -   loading file None
05/07/2020 02:55:48 - INFO - transformers.tokenization_utils -   loading file ./new_finetune/special_tokens_map.json
05/07/2020 02:55:48 