## Carregando os pacotes

In [6]:
library(dplyr) # manipulação dos dados
library(ggplot2) # visualização
library(gridExtra) # visualizar vários gráficos juntos
library(tidytext) # mineração de texto
library(wordcloud2) # criação de nuvem de palavras

## Lendo os dados

In [2]:
# Load the raw lyrics data set, keeping text columns as character vectors.
prince_orig <- read.csv(
    file = "data/prince_raw_data.csv",
    stringsAsFactors = FALSE
)

In [3]:
# Column names available in the raw data set
colnames(prince_orig)

In [7]:
# Keep only the columns used below; `text` and the two US chart columns
# are renamed on the way.
prince <- prince_orig %>%
    select(
        lyrics = text,
        song,
        year,
        album,
        peak,
        us_pop = US.Pop,
        us_rnb = US.R.B
    )

# Inspect a single row (139) to check the selection
glimpse(prince[139, ])

Observations: 1
Variables: 7
$ lyrics <chr> "I just can't believe all the things people say, controversy\n…
$ song   <chr> "controversy"
$ year   <int> 1981
$ album  <chr> "Controversy"
$ peak   <int> 3
$ us_pop <chr> "70"
$ us_rnb <chr> "3"


In [8]:
# Number of rows and columns in the selected data
c(nrow(prince), ncol(prince))

In [9]:
# Show the first 300 characters of the lyrics in row 139
str(prince$lyrics[139], nchar.max = 300)

 chr "I just can't believe all the things people say, controversy\nAm I Black or White? Am I straight or gay? Controversy\nDo I believe in God? Do I believe in me? Controversy\nControversy, controversy\nI can't understand human curiosity, controversy\nWas it good for you? Was I what you w"| __truncated__


## Limpeza básica

In [10]:
# Expand common English contractions in a character vector.
#
# Args:
#   doc: character vector of text. Matching is case-sensitive and only the
#        straight apostrophe (') is recognised.
#
# Returns:
#   `doc` with the contractions expanded; "'s" is dropped rather than
#   expanded.
fix.contractions <- function(doc) {
    # Irregular forms must be replaced before the generic "n't" rule,
    # otherwise "won't" would become "wo not" and "can't" "ca not".
    doc <- gsub("won't", "will not", doc)
    doc <- gsub("can't", "can not", doc)
    doc <- gsub("n't", " not", doc)
    # Leading space is required so that "I'll" becomes "I will",
    # not "Iwill".
    doc <- gsub("'ll", " will", doc)
    doc <- gsub("'re", " are", doc)
    doc <- gsub("'ve", " have", doc)
    doc <- gsub("'m", " am", doc)
    doc <- gsub("'d", " would", doc)
    # Remove "'s" entirely.
    doc <- gsub("'s", "", doc)
    doc
}

In [11]:
# fix.contractions() is built on gsub(), which is vectorised, so it can be
# applied to the whole column at once. sapply() over a character vector
# would also attach the original lyrics as element names.
prince$lyrics <- fix.contractions(prince$lyrics)

In [12]:
# Replace every character that is not an ASCII letter, a digit or a space
# with a single space.
removeSpecialChars <- function(x) {
    gsub(pattern = "[^a-zA-Z0-9 ]", replacement = " ", x = x)
}

# Strip special characters from the lyrics. removeSpecialChars() wraps
# gsub(), which is vectorised, so no sapply() is needed.
prince$lyrics <- removeSpecialChars(prince$lyrics)

In [13]:
# Convert all lyrics to lower case. tolower() is vectorised, so it is
# applied to the whole column directly instead of through sapply().
prince$lyrics <- tolower(prince$lyrics)

In [15]:
# Show the first 300 characters of the cleaned lyrics in row 139
str(prince$lyrics[139], nchar.max = 300)

 chr "i just can not believe all the things people say  controversy am i black or white  am i straight or gay  controversy do i believe in god  do i believe in me  controversy controversy  controversy i can not understand human curiosity  controversy was it good for you  was i what you wa"| __truncated__


In [16]:
# Per-column summary of the cleaned data
summary(object = prince)

    lyrics              song                year         album          
 Length:824         Length:824         Min.   :1978   Length:824        
 Class :character   Class :character   1st Qu.:1989   Class :character  
 Mode  :character   Mode  :character   Median :1996   Mode  :character  
                                       Mean   :1995                     
                                       3rd Qu.:1999                     
                                       Max.   :2015                     
                                       NA's   :495                      
      peak          us_pop             us_rnb         
 Min.   : 0.00   Length:824         Length:824        
 1st Qu.: 2.00   Class :character   Class :character  
 Median : 7.00   Mode  :character   Mode  :character  
 Mean   :15.48                                        
 3rd Qu.:19.00                                        
 Max.   :88.00                                        
 NA's   :751                   

In [17]:
# Distinct release years present in the data
unique(prince[["year"]])