In [0]:
# Install the third-party packages this notebook imports below
# (TensorFlow GPU build, h5py and epitran). No versions are pinned here.
!pip install tensorflow-gpu
!pip install h5py
!pip install epitran

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/0a/93/c7bca39b23aae45cd2e85ad3871c81eccc63b9c5276e926511e2e5b0879d/tensorflow_gpu-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 34kB/s 
[?25hCollecting tensorboard<2.2.0,>=2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/d9/41/bbf49b61370e4f4d245d4c6051dfb6db80cec672605c91b1652ac8cc3d38/tensorboard-2.1.1-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.9MB 44.2MB/s 
Collecting tensorflow-estimator<2.2.0,>=2.1.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/18/90/b77c328a1304437ab1310b463e533fa7689f4bfc41549593056d812fab8e/tensorflow_estimator-2.1.0-py2.py3-none-any.whl (448kB)
[K     |████████████████████████████████| 450kB 68.8MB/s 
[?25hCollecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2

In [0]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential,model_from_json
import tensorflow.keras.utils as ku 

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

import pandas as pd
import numpy as np
from numpy.random import choice
import string, os 
import h5py
import epitran
import math

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [0]:
# Load one lyrics table per artist; the first CSV column is used as the index.
booba = pd.read_csv("booba.csv", index_col=0)
damso = pd.read_csv("damso.csv", index_col=0)
guizmo = pd.read_csv("guizmo.csv", index_col=0)
kaaris = pd.read_csv("kaaris.csv", index_col=0)
lomepal = pd.read_csv("lomepal.csv", index_col=0)
nekfeu = pd.read_csv("nekfeu.csv", index_col=0)
nepal = pd.read_csv("nepal.csv", index_col=0)
orelsan = pd.read_csv("orelsan.csv", index_col=0)
pnl = pd.read_csv("pnl.csv", index_col=0)
sch = pd.read_csv("sch.csv", index_col=0)
vald = pd.read_csv("vald.csv", index_col=0)

# DataFrame.append is deprecated (removed in pandas 2.0), so the per-artist
# frames are stacked with a single pd.concat call instead of a chain of
# appends. ignore_index=True renumbers the rows 0..n-1, which is what the
# former reset_index(drop=True) did.
df = pd.concat(
    [booba, damso, guizmo, kaaris, lomepal, nekfeu, nepal, orelsan, pnl, sch, vald],
    ignore_index=True,
)

In [0]:
def get_sequence_of_tokens(corpus):
    """Fit a tokenizer on ``corpus`` and expand each line into n-gram prefixes.

    A fresh ``Tokenizer`` is created on every call. Fitting the shared
    module-level instance instead would accumulate the vocabulary of every
    corpus seen so far, so re-running the cell (or building a second model on
    another artist) would silently inflate ``total_words``.

    Args:
        corpus: Sequence of text lines. It is traversed twice (fit, then
            encode), so pass a list rather than a one-shot iterator.

    Returns:
        A tuple ``(input_sequences, total_words, tokenizer)`` where
        ``input_sequences`` holds, for every line, each token prefix of
        length 2 up to the full line, ``total_words`` is the vocabulary size
        plus one, and ``tokenizer`` is the fitted tokenizer.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(corpus)
    # +1 because the Keras tokenizer never assigns index 0 to a word.
    total_words = len(tokenizer.word_index) + 1

    # Every prefix of at least two tokens becomes one training sample:
    # the last token is the label, the ones before it are the context.
    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])
    return input_sequences, total_words, tokenizer

def generate_padded_sequences(input_sequences):
    """Left-pad the token sequences to one length and split off the labels.

    Args:
        input_sequences: Non-empty list of token-id lists.

    Returns:
        A tuple ``(predictors, max_sequence_len, label)``: every column but
        the last of the padded array, the padded length, and the last column.
    """
    longest = max(len(tokens) for tokens in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=longest, padding='pre'))

    # The final token of each row is the target; the rest is its context.
    context = padded[:, :-1]
    target = padded[:, -1]
    return context, longest, target

def create_model(max_sequence_len, total_words):
    """Build and compile the next-word model.

    The stack is Embedding (10 dimensions) -> LSTM (128 units) -> softmax
    Dense over ``total_words`` classes, compiled with sparse categorical
    cross-entropy and the Adam optimizer.

    Args:
        max_sequence_len: Padded sequence length including the label token;
            the network input is one token shorter.
        total_words: Number of output classes / embedding rows.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    network = tf.keras.Sequential()
    stack = (
        # Input embedding over the context tokens (label column excluded).
        Embedding(total_words, 10, input_length=max_sequence_len - 1),
        LSTM(128),
        Dense(total_words, activation='softmax'),
    )
    for layer in stack:
        network.add(layer)

    network.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    return network

def make_model(df=df):
    """Prepare training data and an untrained model from a lyrics frame.

    Args:
        df: DataFrame with a ``lyrics_clean`` column of newline-separated
            lyrics. The default is the module-level ``df`` as it existed when
            this function was defined.

    Returns:
        A tuple ``(total_words, max_sequence_len, predictors, label, model,
        tokenizer)``.
    """
    sequences = []
    # Iterate over the column values instead of indexing by the labels
    # 0..n-1: label lookup raises KeyError on any frame whose index is not a
    # plain 0..n-1 range (for example a filtered or re-indexed frame).
    for lyrics in df.lyrics_clean:
        sequences.extend(lyrics.replace("’", " ").replace("“", " ").split("\n"))
    inp_sequences, total_words, tokenizer = get_sequence_of_tokens(sequences)
    predictors, max_sequence_len, label = generate_padded_sequences(inp_sequences)
    model = create_model(max_sequence_len, total_words)

    return total_words, max_sequence_len, predictors, label, model, tokenizer

def generate_sent(total_words, next_words, model, max_sequence_len, tokenizer, seed_text=""):
    """Sample ``next_words`` words from the model, one at a time.

    Each step encodes the text so far, asks the model for the next-word
    distribution and draws two distinct candidate indices from it. The first
    candidate is used unless it would repeat the same word a third time in a
    row, in which case the second candidate is used.

    Args:
        total_words: Number of classes the model predicts (must equal the
            length of the prediction vector).
        next_words: How many words to append.
        model: Object whose ``predict`` returns the next-word probabilities.
        max_sequence_len: Padded length used at training time (label included).
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Text to start from; it is kept as the first element.

    Returns:
        The seed text followed by the sampled words, joined by single spaces.
    """
    # Reverse lookup built once, instead of scanning word_index per word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    words = [seed_text]
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([" ".join(words)])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predictions = model.predict(token_list, verbose=0).reshape(-1)

        # Work on a float64 copy: float32 softmax output can miss numpy's
        # "probabilities sum to 1" tolerance and make choice() raise.
        probabilities = np.array(predictions, dtype=np.float64)
        # Index 0 is the padding class and maps to no word; drawing it used
        # to leave the output word unbound (or stale from the previous step).
        probabilities[0] = 0.0
        probabilities /= probabilities.sum()

        first, second = choice(total_words, 2, replace=False, p=probabilities)
        output_word = index_to_word[int(first)]

        # Avoid emitting the same word three times in a row.
        if len(words) >= 2 and output_word == words[-1] == words[-2]:
            output_word = index_to_word[int(second)]

        words.append(output_word)
    return " ".join(words)

def save_model(model, name="model.json", weights="model.h5"):
    """Write the model architecture (JSON) and its weights to disk.

    Args:
        model: Object exposing ``to_json()`` and ``save_weights(path)``.
        name: Path of the JSON architecture file. Defaults to the previously
            hard-coded ``"model.json"``.
        weights: Path of the weights file. Defaults to the previously
            hard-coded ``"model.h5"``.
    """
    with open(name, "w") as json_file:
        json_file.write(model.to_json())
    model.save_weights(weights)
    print("Saved model to disk")

def load_model(name="model.json", weights="model.h5"):
    """Rebuild a model from a JSON architecture file and load its weights.

    Args:
        name: Path of the JSON architecture file.
        weights: Path of the weights file.

    Returns:
        The model returned by ``model_from_json`` with the weights loaded.
    """
    # Both arguments used to be ignored in favour of hard-coded paths; the
    # defaults keep the old behaviour for callers that pass nothing.
    with open(name, "r") as json_file:
        loaded_model = model_from_json(json_file.read())
    loaded_model.load_weights(weights)
    return loaded_model

def dotproduct(v1, v2):
    """Return the sum of pairwise products of ``v1`` and ``v2``.

    Pairs are formed with ``zip``, so extra items of the longer input are
    ignored; two empty inputs give 0.
    """
    total = 0
    for left, right in zip(v1, v2):
        total = total + left * right
    return total

def length(v):
    """Return the Euclidean norm of ``v`` (square root of its self dot product)."""
    squared_norm = dotproduct(v, v)
    return math.sqrt(squared_norm)

def angle(v1, v2):
    """Return the angle in radians, within [0, pi], between two vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers (paired with ``v1`` element by element).

    Returns:
        ``acos`` of the cosine similarity of the two vectors.

    Raises:
        ZeroDivisionError: If either vector has zero length (unchanged from
            the previous behaviour).
    """
    dot = sum(a * b for a, b in zip(v1, v2))
    norms = math.sqrt(sum(a * a for a in v1)) * math.sqrt(sum(b * b for b in v2))
    cosine = dot / norms
    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for (anti-)parallel vectors, e.g. [1, 1, 1] against itself, which made
    # math.acos raise "math domain error". Clamp before taking the arc cosine.
    cosine = max(-1.0, min(1.0, cosine))
    return math.acos(cosine)

def matrixer(sequence):
    """Turn a sentence into a sound-count vector and a last-vowel indicator.

    The sentence is lower-cased and transcribed with epitran's ``fra-Latn``
    model. Two lists are then built from the transcription:

    * ``vector``: the substring count of every entry of ``vowels`` followed
      by, for every group in ``phonemes``, the summed counts of its members.
    * ``rhyme``: a 0/1 list aligned with ``vowels`` marking the last vowel
      character of the transcription.

    Args:
        sequence: The sentence to analyse.

    Returns:
        ``[vector, rhyme]``. ``rhyme`` is all zeros when the transcription
        contains no vowel character (this used to raise ``IndexError``).
    """
    vowels = ["j","w","ɥ","a","ɑ","e","ɛ","ɛː","ə","i","œ","ø","o","ɔ","u","y","ɑ̃","ɛ̃","œ̃","ɔ̃"]
    phonemes = [['ɑ','a'],['e', 'ɛ', 'ɛː', 'ə'],['i', 'j'],['o','ɔ'],['wa','wɑ','wɛ̃'],['u','w'],['y','ɥ']
                ,['ø','œ','e'],['ɔ̃'], ['ɑ̃'], ['ɛ̃','in','œ̃'], ['b'], ['ks','k','kw'],['sj','si']]

    epi = epitran.Epitran('fra-Latn')
    sequence = epi.transliterate(sequence.lower())

    # Raw substring counts per vowel, then aggregated counts per sound group.
    vector = [sequence.count(vowel) for vowel in vowels]
    vector.extend(sum(sequence.count(sound) for sound in group) for group in phonemes)

    # The filter works character by character, so only the single-character
    # entries of `vowels` can match here; multi-character entries (long or
    # nasal vowels) are represented by their base character.
    seq = ''.join([l for l in sequence if l in vowels])
    # Guard against a transcription without any vowel character.
    last_vowel = seq[-1] if seq else None
    rhyme = [1 if vowel == last_vowel else 0 for vowel in vowels]
    return [vector, rhyme]

def compare(context, sequences):
    """Pick the candidate sentence that sounds closest to ``context``.

    Every candidate is scored by the angle between its sound vector and the
    context's, plus a penalty of 1 when its last-vowel indicator differs from
    the context's. The scores are printed and the lowest-scoring candidate
    (the first one on ties) is returned.

    Args:
        context: Reference sentence.
        sequences: Non-empty list of candidate sentences. It is no longer
            modified; previously ``context`` was inserted at its front.

    Returns:
        The best-scoring element of ``sequences``.

    Raises:
        ValueError: If ``sequences`` is empty.
    """
    if not sequences:
        raise ValueError("compare() needs at least one candidate sequence")

    # Analyse each sentence once (it used to be transcribed twice).
    context_vector, context_rhyme = matrixer(context)

    angles = []
    for candidate in sequences:
        vector, rhyme = matrixer(candidate)
        penalty = 0 if rhyme == context_rhyme else 1
        angles.append(angle(context_vector, vector) + penalty)

    print(angles)

    return sequences[angles.index(min(angles))]

def couplet(model, max_sequence_len, tokenizer, sents=5, comp=5):
    """Generate a verse of ``sents + 1`` lines chained by sound similarity.

    A first line is sampled freely. For each following line, ``comp``
    candidates are sampled and ``compare`` keeps the one closest to the
    previous line.

    Args:
        model: Trained next-word model.
        max_sequence_len: Padded length used at training time.
        tokenizer: Fitted tokenizer matching ``model``.
        sents: Number of lines to add after the opening line.
        comp: Number of candidates sampled per added line.

    Returns:
        The lines joined with newlines.
    """
    # generate_sent requires the vocabulary size as its first argument; the
    # previous calls omitted it, which shifted every argument by one and
    # raised TypeError. It is recomputed the same way as at tokenization time.
    total_words = len(tokenizer.word_index) + 1

    def new_line():
        # Each line gets a random length of 5 to 20 sampled words.
        return generate_sent(total_words, np.random.randint(5, 21), model,
                             max_sequence_len, tokenizer)

    sentlist = [new_line()]
    for _ in range(sents):
        trials = [new_line() for _ in range(comp)]
        sentlist.append(compare(sentlist[-1], trials))
    return "\n".join(sentlist)

def refrain(model, tokenizer, comp=5, form=None, max_sequence_len=None):
    """Generate a four-line chorus from two lines that sound alike.

    A first line A is sampled, then ``comp`` candidates are sampled and
    ``compare`` keeps the one closest to A as line B. The two lines are
    arranged according to ``form``: 0 -> ABAB, 1 -> AABB, 2 -> ABBA.

    Args:
        model: Trained next-word model.
        tokenizer: Fitted tokenizer matching ``model``.
        comp: Number of candidates sampled for the second line.
        form: Layout index (0, 1 or 2); drawn at random when ``None``.
        max_sequence_len: Padded length used at training time. When omitted
            it is derived from the model's input length, so the function no
            longer depends on a notebook-level variable.

    Returns:
        The four lines joined with newlines.

    Raises:
        ValueError: If ``form`` is not 0, 1, 2 or ``None``, or if
            ``max_sequence_len`` is omitted and cannot be derived from
            ``model``.
    """
    layouts = {0: (0, 1, 0, 1), 1: (0, 0, 1, 1), 2: (0, 1, 1, 0)}
    # Reject a bad layout before doing any sampling (it used to return None).
    if form is not None and form not in layouts:
        raise ValueError("form must be 0, 1, 2 or None")

    if max_sequence_len is None:
        # The model input is one token shorter than the padded sequences.
        input_len = model.input_shape[1]
        if input_len is None:
            raise ValueError("pass max_sequence_len: the model has no fixed input length")
        max_sequence_len = input_len + 1

    # generate_sent requires the vocabulary size as its first argument; the
    # previous calls omitted it and raised TypeError.
    total_words = len(tokenizer.word_index) + 1

    def new_line():
        # Each line gets a random length of 5 to 14 sampled words.
        return generate_sent(total_words, np.random.randint(5, 15), model,
                             max_sequence_len, tokenizer)

    first = new_line()
    trials = [new_line() for _ in range(comp)]
    second = compare(first, trials)
    lines = (first, second)

    if form is None:
        form = np.random.randint(0, 3)
    return "\n".join(lines[position] for position in layouts[form])

In [0]:
# Build the vocabulary, the padded training arrays and a compiled (not yet
# trained) model from the `pnl` frame only, rather than the combined `df`.
total_words, max_sequence_len, predictors, label, model, tokenizer = make_model(pnl)

In [0]:
# Show the layer stack, output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 46, 10)            49100     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               71168     
_________________________________________________________________
dense (Dense)                (None, 4910)              633390    
Total params: 753,658
Trainable params: 753,658
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Train for 30 epochs: inputs are the padded context tokens, targets the
# token that follows each context.
model.fit(x=predictors, y=label, epochs=30)

Train on 39033 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f43bc524898>

In [0]:
# Sample 10 words after the seed word "ouais".
generate_sent(total_words, 10, model, max_sequence_len,tokenizer, "ouais")

'ouais plus au fond de la sère l orfèvres pas qu'

In [0]:
# Generate a verse and print it. The lists printed before the verse are the
# per-candidate scores that compare() prints for each added line.
print(couplet(model,max_sequence_len,tokenizer))

[1.2871921089465703, 1.4284525545876783, 1.4853002229910166, 1.880208911174086, 0.6404342992971294]
[0.4889340593007351, 1.862201467699339, 1.9823163392199068, 1.7780432857004544, 1.5752903348411684]
[1.390720773122277, 1.447255423508109, 1.4482586717402668, 0.651547090712549, 1.663686908743296]
[1.9390702522212604, 1.6306585595278458, 1.7043991861118855, 1.7974961179982771, 1.6928592392357809]
[1.489453454476958, 1.8089218810405066, 1.7975615252157127, 1.3948803566773706, 1.604895314547286]
 ton bâtard t court pas la semelle j maille j emmène ce des
 ça fait pas trop calmant que la famille c
 l histoire à les fils de d sentiments là pas comme l
 et j suis pas buraliste j me
 les anges plus fait l marques tout sur la rue mètres hmm incompréhensibles pas une
 des talons la gueule sur les escalier je comptes que j
