# Versão simples

## Bibliotecas

In [1]:
#Adaptado de https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
# Importar as bibliotecas
import pandas as pd
import numpy as np
import re
import nltk
import os
from nltk.tokenize import word_tokenize
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.sentiment.vader import SentimentIntensityAnalyzer

## Módulos

In [2]:
# Download dos elementos necessários para conduzir as análises
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

## Pré-processamento

In [4]:
# Carregar os arquivos .txt com as letras
path = '/content/drive/MyDrive/lyrics/' # criar pasta, caso utilize o Colab
files = os.listdir(path)
files.sort(key=lambda f: int(re.sub('\D', '', f)))

text  = []
id = []

for line in files:
  with open(path+line, 'r') as f:
    txt = f.read()
    text.append(txt)
    id.append(line)

In [5]:
# Criar a base de dados (df)
df = pd.DataFrame(zip(id, text), columns = ['id','lyrics'])
df.head(3)

Unnamed: 0,id,lyrics
0,1.txt,"Well, it's 1969, okay?\nAll across the USA\nIt..."
1,2.txt,Out of my mind on Saturday night\n1970 rollin'...
2,3.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h..."


In [6]:
# Limpar as letras
replacer = {'\n':' ',"[\[].*?[\]]": "",'[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’""′‘\\\]':" ", ' +': ' '}

df['cleanLyrics'] = df['lyrics'].replace(replacer, regex=True).apply(lambda x: x.strip()).apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics
0,1.txt,"Well, it's 1969, okay?\nAll across the USA\nIt...",well it s okay all across the usa it s another...
1,2.txt,Out of my mind on Saturday night\n1970 rollin'...,out of my mind on saturday night rollin in sig...
2,3.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...


## Características

In [7]:
# Número de caracteres
df['charCount'] = df['cleanLyrics'].str.len()
# Número de palavras por letras
df['wordCount'] = df['cleanLyrics'].str.split().str.len()
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,charCount,wordCount
0,1.txt,"Well, it's 1969, okay?\nAll across the USA\nIt...",well it s okay all across the usa it s another...,97,22
1,2.txt,Out of my mind on Saturday night\n1970 rollin'...,out of my mind on saturday night rollin in sig...,124,25
2,3.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,419,94


In [8]:
# Média de palavras por letras
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

In [9]:
df['avgWord'] = df['cleanLyrics'].apply(lambda x: avg_word(x))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,charCount,wordCount,avgWord
0,1.txt,"Well, it's 1969, okay?\nAll across the USA\nIt...",well it s okay all across the usa it s another...,97,22,3.454545
1,2.txt,Out of my mind on Saturday night\n1970 rollin'...,out of my mind on saturday night rollin in sig...,124,25,4.0
2,3.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,419,94,3.468085


In [10]:
df.reset_index(inplace = True)
df['uniqueWords'] = pd.Series(np.arange(len(df)))

In [11]:
current = 0
for row in df.itertuples():
    df['uniqueWords'][current] = len(np.unique(df['cleanLyrics'].str.split()[current]))
    current = current + 1
df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,index,id,lyrics,cleanLyrics,charCount,wordCount,avgWord,uniqueWords
0,0,1.txt,"Well, it's 1969, okay?\nAll across the USA\nIt...",well it s okay all across the usa it s another...,97,22,3.454545,18
1,1,2.txt,Out of my mind on Saturday night\n1970 rollin'...,out of my mind on saturday night rollin in sig...,124,25,4.0,23
2,2,3.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,419,94,3.468085,55


In [12]:
df['uniqueWordsProp']  =  df['uniqueWords'] / df['wordCount']
df.head(3)

Unnamed: 0,index,id,lyrics,cleanLyrics,charCount,wordCount,avgWord,uniqueWords,uniqueWordsProp
0,0,1.txt,"Well, it's 1969, okay?\nAll across the USA\nIt...",well it s okay all across the usa it s another...,97,22,3.454545,18,0.818182
1,1,2.txt,Out of my mind on Saturday night\n1970 rollin'...,out of my mind on saturday night rollin in sig...,124,25,4.0,23,0.92
2,2,3.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,419,94,3.468085,55,0.585106


In [13]:
# Análise de sentimentos
sid = SentimentIntensityAnalyzer()

sentiments = df.apply(lambda r: sid.polarity_scores(r['lyrics']), axis=1)

In [14]:
d = pd.DataFrame(list(sentiments))
df = df.join(d)
df.dropna(inplace=True)

In [15]:
df.to_excel('lyrics.xlsx', index= False)

# Versão Completa

## Bibliotecas

In [None]:
#Adaptado de https://www.analyticsvidhya.com/blog/2018/02/the-different-methods-deal-text-data-predictive-python/
# Importar as bibliotecas
import pandas as pd
import numpy as np
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import 	WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import defaultdict
from nltk.corpus import wordnet as wn
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## Módulos



In [None]:
# Download dos elementos necessários para conduzir as análises
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
stop = stopwords.words('english')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


True

In [None]:
# Acrescentar novas stop words
_new_stopwords_to_add = ['whoo', 'ah', 'usa', 'ohhh','ah','yeah','ye','ahahah', 'okay', 'oh','ahahah', 'hahahah', 'aaaaaaaaaaaah', 'ooh']
stop += _new_stopwords_to_add

In [None]:
# POS TAG
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

wl = WordNetLemmatizer()

## Funções

In [None]:
# Função para lemmatizar junto com pos_tag
def convert(text):
    lemmatized_text = []
    for i, tag in pos_tag(text.split()):
        lemmatized_text.append(str(wl.lemmatize(i,tag_map[tag[0]])))

    return ' '.join(lemmatized_text)

## Pré-processamento

In [None]:
# Carregar os arquivos .txt com as letras
path = '/content/lyrics/' # criar pasta, caso utilize o Colab
files = os.listdir(path)
files.sort(key=lambda f: int(re.sub('\D', '', f)))

text  = []
id = []

for line in files:
  with open(path+line, 'r') as f:
    txt = f.read()
    text.append(txt)
    id.append(line)

In [None]:
# Criar a base de dados (df)
df = pd.DataFrame(zip(id, text), columns = ['id','lyrics'])
df.head(3)

Unnamed: 0,id,lyrics
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h..."
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...
2,10.txt,For fifty years they've been married\nAnd they...


In [None]:
# Limpar as letras
replacer = {'\n':' ',"[\[].*?[\]]": "",'[!"#%\'()*+,-./:;<=>?@\[\]^_`{|}~1234567890’""′‘\\\]':" ", ' +': ' '}

df['cleanLyrics'] = df['lyrics'].replace(replacer, regex=True).apply(lambda x: x.strip()).apply(lambda x: " ".join(x.lower() for x in x.split()))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...,hey i m one big queen no one can stop me red l...
2,10.txt,For fifty years they've been married\nAnd they...,for fifty years they ve been married and they ...


In [None]:
# Remover stop words
df['cleanLyricsStop'] = df['cleanLyrics'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,cleanLyricsStop
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,however want get tonight ho night ho get beat ...
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...,hey i m one big queen no one can stop me red l...,hey one big queen one stop red light red green...
2,10.txt,For fifty years they've been married\nAnd they...,for fifty years they ve been married and they ...,fifty years married wait fifty first roll arou...


In [None]:
# Corrigir escrita (ex: rollin -> rolling)
df['cleanLyricsStop'] = df['cleanLyricsStop'].apply(lambda x: str(TextBlob(x).correct()))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,cleanLyricsStop
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,however want get tonight ho night ho get beat ...
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...,hey i m one big queen no one can stop me red l...,hey one big queen one stop red light red green...
2,10.txt,For fifty years they've been married\nAnd they...,for fifty years they ve been married and they ...,fifty years married wait fifty first roll arou...


In [None]:
#Lemmatizar
df['lyricsLemma'] = df['cleanLyricsStop'].apply(lambda x: convert(x))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,cleanLyricsStop,lyricsLemma
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,however want get tonight ho night ho get beat ...,however want get tonight ho night ho get beat ...
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...,hey i m one big queen no one can stop me red l...,hey one big queen one stop red light red green...,hey one big queen one stop red light red green...
2,10.txt,For fifty years they've been married\nAnd they...,for fifty years they ve been married and they ...,fifty years married wait fifty first roll arou...,fifty year marry wait fifty first roll around ...


In [None]:
'''
#Stemming
st = PorterStemmer()
df['lyricsStemm'] = df['cleanLyricsStop'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df.head(3)

'''

## Características

In [None]:
# Número de palavras por letras
df['wordCount'] = df['cleanLyrics'].str.split().str.len()
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,cleanLyricsStop,lyricsLemma,wordCount
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,however want get tonight ho night ho get beat ...,however want get tonight ho night ho get beat ...,94
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...,hey i m one big queen no one can stop me red l...,hey one big queen one stop red light red green...,hey one big queen one stop red light red green...,48
2,10.txt,For fifty years they've been married\nAnd they...,for fifty years they ve been married and they ...,fifty years married wait fifty first roll arou...,fifty year marry wait fifty first roll around ...,47


In [None]:
# Número de stop words por letras
df['stopwords']  =  df['cleanLyrics'].apply(lambda x: len([x for x in x.split() if x in stop]))
df.head(3)

Unnamed: 0,id,lyrics,cleanLyrics,cleanLyricsStop,lyricsLemma,wordCount,stopwords
0,4.txt,"I'm on it, ooh, I'm on it\nI'm so on it, and h...",i m on it ooh i m on it i m so on it and howev...,however want get tonight ho night ho get beat ...,however want get tonight ho night ho get beat ...,94,54
1,9.txt,Hey I'm one big queen\nNo one can stop me\nRed...,hey i m one big queen no one can stop me red l...,hey one big queen one stop red light red green...,hey one big queen one stop red light red green...,48,19
2,10.txt,For fifty years they've been married\nAnd they...,for fifty years they ve been married and they ...,fifty years married wait fifty first roll arou...,fifty year marry wait fifty first roll around ...,47,28


In [None]:
# Número de palavras por letras (lemma)
df['wordCountLemma']  =  df['lyricsLemma'].str.split().str.len()
df.head(3)

In [None]:
#df.reset_index(inplace = True)
df['uniqueLyrics'] = pd.Series(np.arange(len(df)))

In [None]:
df['uniqueLyrics']

0        0
1        1
2        2
3        3
4        4
      ... 
914    914
915    915
916    916
917    917
918    918
Name: uniqueLyrics, Length: 919, dtype: int64

In [None]:
# Riqueza do vocabulário
current = 0
for row in df.itertuples():
    df['uniqueLyrics'][current] = len(np.unique(df['lyricsLemma'].str.split()[current]))
    current = current + 1
df.head(3)

## BOW e TF-IDF

In [None]:
# Bag of words (BOW)
bow = CountVectorizer(max_features=50, ngram_range=(2,2),analyzer = "word")
train_bow = bow.fit_transform(df['lyricsLemma'])
lemmaBOW2gram = pd.DataFrame(bow.transform(df['lyricsLemma']).toarray(), columns=sorted(bow.vocabulary_.keys()))
lemmaBOW2gram

In [None]:
#TFIDF
tfidf = TfidfVectorizer(max_features=50, ngram_range=(2,2),analyzer = "word")
train_vect = tfidf.fit_transform(df['lyricsLemma'])

lemmaTFIDF2gram = pd.DataFrame(tfidf.transform(df['lyricsLemma']).toarray(), columns=sorted(tfidf.vocabulary_.keys()))
lemmaTFIDF2gram