#Data Cleaning

##Import des librairies et des textes

In [0]:
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
nekfeu = pd.read_csv("nekfeu.csv")

##Traitement du texte

In [0]:
def split_lines(lyrics):
  lyrics.lyrics = lyrics.lyrics.apply(lambda x: str(x))
  liste = []
  for i in range(len(lyrics)):
    lyric = lyrics.lyrics[i].splitlines()
    liste+=lyric
  dataset = pd.DataFrame(liste, columns=['sentences'])
  return(dataset)

In [0]:
dataset = split_lines(nekfeu)
initial_len = len(dataset)

In [0]:
print('Nous avons {} lignes initialement'.format(initial_len))

Nous avons 8133 lignes initialement


In [0]:
def drop_null_sentence(dataset):
  drop = list()
  for i in range(len(dataset.sentences)):
    if len(dataset.sentences[i].split(' '))<2:
      drop.append(i)
  dataset = dataset.drop(index=drop)
  dataset = dataset.reset_index()
  return(dataset)

In [0]:
dataset = drop_null_sentence(dataset)
print('Il nous reste {} lignes ({}%)'.format(len(dataset),round(len(dataset)/initial_len*100,1)))
data = dataset

Il nous reste 7480 lignes (92.0%)


In [0]:
inputdict = {'?':'', '...':'', '!':'', '(':'', ')':'','"':'', "'":'', '/':'', '.':' ', ',':'', ':':''}

In [0]:
#Suppression de la ponctuation et des caractères de l'inputdict
def text_harmonization(dataset):
  sentences = dataset.sentences.apply(lambda x: x.lower())
  s=' '
  phrase_clean = []
  for sentence in sentences:
    for word, initial in inputdict.items():
      sentence = sentence.replace(word.lower(), initial)
    phrase_clean.append(sentence)
  dataset_harmonized = pd.DataFrame(phrase_clean, columns=["sentences"])
  return(dataset_harmonized)

In [0]:
dataset = text_harmonization(dataset)

In [0]:
# Python program to check if two to get unique values from list
# using traversal function to get unique values

def unique(liste):
    # intilize a null list
    unique_list = []
    # traverse for all elements
    for x in liste:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
    return(unique_list)

In [0]:
def del_word_repeat(dataset):
  space = ' '
  data = []
  for sentence in dataset.sentences:
    word = sentence.split()
    if len(word)>0:  
      new_sentence = [word[0]]
    for i in range(1,len(word)):
      if word[i-1]!=word[i]:
        new_sentence.append(word[i])
    new_sentence = space.join(new_sentence)
    data.append((new_sentence))
    dataset = pd.DataFrame(data, columns=['sentences'])
  return(dataset)

In [0]:
dataset = del_word_repeat(dataset)

In [0]:
def del_repeat(dataset):
  sentences = dataset.sentences
  clean = []
  for phrase in sentences:
    split = phrase.split(' ')
    if len(unique(split)) > len(split)/2:
      clean.append(split)
  for i in range(len(clean)):
    s = ' '
    clean[i] = s.join(clean[i])
  return(pd.DataFrame(clean, columns=['sentences']))

In [0]:
dataset = del_repeat(dataset)

In [0]:
def del_sentences_repeat(dataset):
  sentences = dataset.sentences
  clean = []
  for i in range(len(sentences)-2):
    if sentences[i]!=sentences[i+1] and sentences[i]!=sentences[i+2]:
      clean.append(sentences[i])
  dataset = pd.DataFrame(clean, columns=['sentences'])
  return(dataset)

In [0]:
dataset = del_sentences_repeat(dataset)
print('Il nous reste {} lignes ({}%)'.format(len(dataset),round(len(dataset)/initial_len*100,1)))

Il nous reste 7077 lignes (87.0%)


#Export des dataset_clean

In [0]:
booba = pd.read_csv("booba.csv")
damso = pd.read_csv("damso.csv")
guizmo = pd.read_csv("guizmo.csv")
kaaris = pd.read_csv("kaaris.csv")
lomepal = pd.read_csv("lomepal.csv")
nekfeu = pd.read_csv("nekfeu.csv")
nepal = pd.read_csv("nepal.csv")
orelsan = pd.read_csv("orelsan.csv")
pnl = pd.read_csv("pnl.csv")
sch = pd.read_csv("sch.csv")
vald = pd.read_csv("vald.csv")

In [0]:
rap_lyrics = [booba, damso, guizmo, kaaris, lomepal, nekfeu, nepal, orelsan, pnl, sch, vald]
rap_names = ['booba', 'damso', 'guizmo', 'kaaris', 'lomepal', 'nekfeu', 'nepal', 'orelsan', 'pnl', 'sch', 'vald']

for lyric in range(len(rap_lyrics)):
  dt = split_lines(rap_lyrics[lyric])
  initial_len = len(dt)
  dt = drop_null_sentence(dt)
  dt = drop_null_sentence(text_harmonization(dt))
  dt = drop_null_sentence(del_word_repeat(dt))
  dt = del_sentences_repeat(dt)
  dt = del_repeat(dt)
  csv = rap_names[lyric]+'_clean.csv'
  dt.to_csv(csv, index = False, header=True)
  print(rap_names[lyric] + ' -> Il nous reste {} lignes ({}%)'.format(len(dt),round(len(dt)/initial_len*100,1)))

booba -> Il nous reste 10435 lignes (85.2%)
damso -> Il nous reste 4260 lignes (81.3%)
guizmo -> Il nous reste 11910 lignes (87.8%)
kaaris -> Il nous reste 7856 lignes (78.2%)
lomepal -> Il nous reste 4113 lignes (85.2%)
nekfeu -> Il nous reste 7019 lignes (86.3%)
nepal -> Il nous reste 1988 lignes (84.3%)
orelsan -> Il nous reste 4614 lignes (86.0%)
pnl -> Il nous reste 3975 lignes (82.7%)
sch -> Il nous reste 5656 lignes (86.3%)
vald -> Il nous reste 8710 lignes (83.3%)


In [0]:
b2o = pd.read_csv("damso_clean.csv")
b2o.isnull().any()

sentences    False
dtype: bool

#Transformation d'un texte en syllables IPA

In [0]:
pip install epitran



In [0]:
pip install FinnSyll



In [0]:
import epitran
import math
import numpy as np
from finnsyll import FinnSyll
import pandas as pd

f = FinnSyll()
epi = epitran.Epitran('fra-Latn')

INFO:morfessor.io:Loading model from '/usr/local/lib/python3.6/dist-packages/finnsyll/data/finnsyll-morfessor.bin'...
INFO:morfessor.io:Done.


In [0]:
print(epi.transliterate('Ce ci est une ri me'))

sə si ɛst ynə ri m


In [0]:
sequence = "vitamine verte pédé des fois jmimagine espérer"
sequence1 = "vis ta vie jette les dés même si les pyramides guettent pédé"
sequence2 = "la vie ma fait connaître des putains d’braves avec des têtes de lossbar"
sequence3 = "mais ces lascars qui font du zgar et mont lance-ba méritent loscar"
sequence4 = "seuls les coups durs officialisent les reufs"
sequence5 = "quand tes dans le vice la vie cest reuch"
sequence6 = "il faut quje fasse le point je nsuis pas devin mais si je clamse demain"
sequence7 = "jveux pas qulimpact dune balle de flingue devienne mon clap de fin"

In [0]:
sequences = [sequence, sequence1, sequence2, sequence3, sequence4, sequence5, sequence6, sequence7]

#Transformation des phrases en syllables puis en phonétique
 

In [0]:
inputdict = {'.':' ','’':''}
#Suppression de la ponctuation et des caractères de l'inputdict
def split_syll(sentences):
  s=' '
  phrase_clean = []
  for sentence in sentences:
    for word, initial in inputdict.items():
      sentence = sentence.replace(word.lower(), initial)
    phrase_clean.append(sentence)
  return(phrase_clean)

In [0]:
def syllabify_sentences(sentences):
  new_sentences = []
  for i in sentences:
    word = f.syllabify(i)
    word = split_syll(word)
    i = epi.transliterate(word[0])
    new_sentences.append(i)
  return(new_sentences)

In [0]:
syll_sequences = syllabify_sentences(sequences)

In [0]:
IPA_phonemes = [['ɑ','a'],['e', 'ɛ', 'ɛː', 'ə'],['i', 'j'],['o','ɔ'],['wa','wɑ','wɛ̃'],['u','w'],['y','ɥ']
              ,['ø','œ','e'],['ɔ̃'], ['ɑ̃'], ['ɛ̃','in','œ̃'], ['b'], ['ks','k','kw'],['sj','si']]

In [0]:
for oral in IPA_char:
  IPA_phonemes.append([oral])

In [0]:
oral_vowel = ['i','e','ɛ','y','ø','ə','œ','u','o','ɔ','ɑ','a']
oral_cons = ['b','k','s','g','ʃ','d','f','ʒ','ɡw','l','p','t','v','z','ʁ',]
nas_cons = ['m','n','ɲ','ŋ']
nas_vowel = ['ɛ̃','œ̃','ɔ̃','ɑ̃']
semi_cons = ['j','w','ɥ']
IPA_char = oral_vowel+oral_cons+nas_cons+nas_vowel+semi_cons

In [0]:
def matrixer(sequence):
  #Use IPA from the 1st sequence in sequences_list
  vowels = IPA_phonemes
  vector = list()
#  sequence = sequence.lower()
  for vowel in vowels:
    somme = 0
    for v in vowel:
      somme+=sequence.count(v)
    vector.append(somme)
#  seq = ''.join([l for l in sequence if l in vowels])
#  for vowel in vowels:
#    for v in vowel:
#      if seq[-1] in v:
#        vector.append(1)
#      else:
#        vector.append(0)
  return vector

In [0]:
def dotproduct(v1, v2):
  return sum((a*b) for a, b in zip(v1, v2))

def length(v):
  return math.sqrt(dotproduct(v, v))

def angle(v1, v2):
  return math.acos(dotproduct(v1, v2) / (length(v1) * length(v2)))

In [0]:
def compare(list_of_sequences):
  print('- {}'.format(list_of_sequences[0]))
  sequences=[]
  for i in list_of_sequences:
    sequences.append(syllabify_sentences(i))
  vectors = list()
  angles = list()
  
  for seq in sequences:
    vectors.append(matrixer(seq))
  
  for i in range(1,len(list_of_sequences)):
    angles.append(angle(vectors[0],vectors[i]))
#    print("Phrase:{} | Angle:{}".format(list_of_sequences[i],round(angle(vectors[0],vectors[i]),2)))

  return("Best rime: {}".format(list_of_sequences[angles.index(min(angles))+1]))

In [0]:
for sequence in sequences:
  print(sequence)

vitamine verte pédé des fois jmimagine espérer
vis ta vie jette les dés même si les pyramides guettent pédé
la vie ma fait connaître des putains d’braves avec des têtes de lossbar
mais ces lascars qui font du zgar et mont lance-ba méritent loscar
seuls les coups durs officialisent les reufs
quand tes dans le vice la vie cest reuch
il faut quje fasse le point je nsuis pas devin mais si je clamse demain
jveux pas qulimpact dune balle de flingue devienne mon clap de fin


In [0]:
def unique(liste):
    # intilize a null list
    unique_list = []
    # traverse for all elements
    for x in liste:
        # check if exists in unique_list or not
        if x not in unique_list:
            unique_list.append(x)
    return(unique_list)

In [0]:
IPA_syll = unique(syll_sequences[0])

In [0]:
compare(sequences)

- vitamine verte pédé des fois jmimagine espérer


'Best rime: vis ta vie jette les dés même si les pyramides guettent pédé'

In [0]:
print("{}".format(IPA_syll))
print("{}".format(matrixer(sequence)))
print("{}".format(matrixer(sequence1)))
print("{}".format(matrixer(sequence3)))

['t', 'ɔ', 'y', ' ', 'p', 'r', 'ɛ', 's', 'd', 'i', 'k', 'n', 'ʒ', 'v', 'a', 'm', 'ə']
[4, 9, 5, 1, 0, 4, 0, 9, 0, 0, 2, 1, 0, 0, 4, 9, 0, 0, 0, 0, 0, 4, 1, 0, 0, 4, 3, 1, 1, 4, 2, 2, 0, 1, 1, 0, 0, 0, 0, 5, 2, 6, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[2, 9, 5, 0, 0, 1, 1, 9, 0, 0, 0, 0, 0, 1, 4, 9, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 2, 0, 6, 3, 0, 2, 0, 1, 6, 0, 0, 0, 0, 2, 3, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0]
[7, 4, 3, 3, 0, 2, 0, 4, 0, 0, 0, 1, 0, 0, 3, 4, 0, 0, 0, 0, 0, 2, 3, 0, 0, 7, 0, 1, 5, 1, 1, 0, 0, 1, 5, 1, 0, 0, 0, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [0]:
import random
sequences = pd.read_csv("damso_clean.csv")
seq = list(sequences.sentences)
random.shuffle(seq)
print(unique(syllabify_sentences(seq[0])))
IPA_syll = unique(syllabify_sentences(seq[0]))
compare(seq)

['ʒ', 't', 'ə', ' ', 'f', 'r', 'a', 'i', 'p', 'l', 'y', 's', 'd', 'k', 'ɔ', 'm', 'n', 'v', 'e']
- jte ferai plus dcompliments jvois qutu casses déjà les couilles


'Best rime: jfais une croix dessus comme pilate avec jésus'