Analizando Doja Cat Lyrics com Python

In [1]:
%matplotlib inline

import pandas as pd
import string
import seaborn as sns
import matplotlib.pyplot as plt
import collections
import nltk
nltk.download()
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

-- Load Dataset

In [3]:
all_lyrics = pd.read_csv('Doja_Cat_lyrics_all.csv')

In [4]:
all_lyrics.columns

Index(['Unnamed: 0', 'Track Number', 'Track Title', 'Album Name',
       'Year Released', 'Lyric'],
      dtype='object')

In [5]:
all_lyrics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3248 entries, 0 to 3247
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     3248 non-null   int64 
 1   Track Number   3248 non-null   int64 
 2   Track Title    3248 non-null   object
 3   Album Name     3248 non-null   object
 4   Year Released  3248 non-null   int64 
 5   Lyric          3248 non-null   object
dtypes: int64(3), object(3)
memory usage: 152.4+ KB


In [6]:
all_lyrics.head()

Unnamed: 0.1,Unnamed: 0,Track Number,Track Title,Album Name,Year Released,Lyric
0,0,1,Woman,Planet Her,2021,"b'Hey, woman\n'"
1,1,1,Woman,Planet Her,2021,"b'Hey, woman\n'"
2,2,1,Woman,Planet Her,2021,b'Woman\n'
3,3,1,Woman,Planet Her,2021,b'Let me be your woman\n'
4,4,1,Woman,Planet Her,2021,"b'Woman, woman, woman (\n'"


In [7]:
all_lyrics.tail()

Unnamed: 0.1,Unnamed: 0,Track Number,Track Title,Album Name,Year Released,Lyric
3243,46,49,Control,Purrr! - EP,2014,"b'Way-o, oh ah\n'"
3244,47,49,Control,Purrr! - EP,2014,b'Way-o ah\n'
3245,48,49,Control,Purrr! - EP,2014,"b'Way-o, oh ah\n'"
3246,49,49,Control,Purrr! - EP,2014,b'Way-o ah\n'
3247,50,49,Control,Purrr! - EP,2014,"b'Way-o, oh ah\n'"


-- Clean the lyric text
To accurately count keyword mentions, we need to make everything lowercase everything, remove punctuation, and exclude stop words.

In [8]:
#lowercase
all_lyrics['new_lyrics'] = all_lyrics['Lyric'].str.lower()

#remove punctuation
all_lyrics['new_lyrics']= all_lyrics['new_lyrics'].str.replace('[^\w\s]',' ')

#remove stopwords
stop = ['is','it','oh','of','uh','that','this','yeah','le','might',
        'a','am', 'was', 'were', 'be', 'being', 'been','la','hey','b', 'n']

all_lyrics['new_lyrics'] = all_lyrics['new_lyrics'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

  all_lyrics['new_lyrics']= all_lyrics['new_lyrics'].str.replace('[^\w\s]',' ')


In [9]:
#Love language related words
words_affirmation = ['love','friend','friendship','great','proud','baby','mine','feel','feelings','perfect',
                     'precious','like you','beauty']
quality_time = ['watch','never','now','time','spend','text','texting','follow','chill','alone','play',
                   'stay','party','talking','talk','sleep']
physical_touch = ['kiss','love','sex','fuck','eyes','fight','hold','sexuality','face','body','mouth','dick',
                'ass','touch','eye','pussy','eye']
acts_service = ['turn','clean','stay','act','patient','relax','drive','notice','work','need','wait','focused',
              'influence', 'struggle','sleep','share','sharing','honest']
receiving_gifts = ['weed','sweet','ticket','bussines','send','spent','t-shirt','tiffanys','camera','getting','get',
            'money','cash','more','ring','picture','clothes']

In [10]:
#creating a regular expression string for each list of words
words_affirmation_regex = '|'.join(words_affirmation)
quality_time_regex = '|'.join(quality_time)
physical_touch_regex = '|'.join(physical_touch)
acts_service_regex = '|'.join(acts_service)
receiving_gifts_regex = '|'.join(receiving_gifts)

#creating a new column for each category of words
all_lyrics['Words of Affirmation'] = all_lyrics['new_lyrics'].str.contains(words_affirmation_regex)
all_lyrics['Quality Time'] = all_lyrics['new_lyrics'].str.contains(quality_time_regex)
all_lyrics['Physical Touch'] = all_lyrics['new_lyrics'].str.contains(physical_touch_regex)
all_lyrics['Acts of Service'] = all_lyrics['new_lyrics'].str.contains(acts_service_regex)
all_lyrics['Receiving Gifts'] = all_lyrics['new_lyrics'].str.contains(receiving_gifts_regex)

#counting the number of times each category of word appears in the lyrics
words_affirmation_count = sum(all_lyrics['Words of Affirmation'])
quality_time_count = sum(all_lyrics['Quality Time'])
physical_touch_count = sum(all_lyrics['Physical Touch'])
acts_service_count = sum(all_lyrics['Acts of Service'])
receiving_gifts_count = sum(all_lyrics['Receiving Gifts'])

#print
print("Words of Affirmation: ", words_affirmation_count)
print("Quality Time: ", quality_time_count)
print("Physical Touch: ", physical_touch_count)
print("Acts of Service: ", acts_service_count)
print("Receiving Gifts: ", receiving_gifts_count)



Words of Affirmation:  452
Quality Time:  497
Physical Touch:  377
Acts of Service:  290
Receiving Gifts:  374


Visualize how Doja Cat's lyrics love language have changed over time

In [11]:
#create a new dataframe for yearly mentions that groups mentions by year
yearly_mentions = all_lyrics.groupby('Year Released').sum().reset_index()
yearly_mentions


Unnamed: 0.1,Year Released,Unnamed: 0,Track Number,Words of Affirmation,Quality Time,Physical Touch,Acts of Service,Receiving Gifts
0,2014,9118,13865,19,59,16,12,27
1,2018,26920,30164,120,172,103,70,67
2,2019,30222,20944,86,116,92,78,91
3,2021,47217,12332,227,150,166,130,189


In [12]:
yearly_mentions.sort_values(by='Words of Affirmation', ascending=False)

Unnamed: 0.1,Year Released,Unnamed: 0,Track Number,Words of Affirmation,Quality Time,Physical Touch,Acts of Service,Receiving Gifts
3,2021,47217,12332,227,150,166,130,189
1,2018,26920,30164,120,172,103,70,67
2,2019,30222,20944,86,116,92,78,91
0,2014,9118,13865,19,59,16,12,27


In [13]:
yearly_mentions.sort_values(by='Quality Time', ascending=False)

Unnamed: 0.1,Year Released,Unnamed: 0,Track Number,Words of Affirmation,Quality Time,Physical Touch,Acts of Service,Receiving Gifts
1,2018,26920,30164,120,172,103,70,67
3,2021,47217,12332,227,150,166,130,189
2,2019,30222,20944,86,116,92,78,91
0,2014,9118,13865,19,59,16,12,27


-- Tokenize the Lyrics

In [None]:
#run this cell to tokenize the words in the clean_lyric column
all_lyrics['lyrics_tok'] = all_lyrics['new_lyrics'].apply(lambda x: word_tokenize(x))

In [None]:
#determine what words overall are the most frequently used words
#create a list of all the words in the lyrics_tok column
word_list = [word for list_ in all_lyrics['lyrics_tok'] for word in list_]

#use the counter function to count the number of times each word appears
word_frequency = collections.Counter(word_list)
#sort the word frequencies to find out the most common words she's used. 
word_frequency = sorted(word_frequency.items(), key=lambda x: x[1], reverse=True)
#call the word frequency
word_frequency

In [None]:
#create a position variable that includes both the track number and line number
## YOUR CODE HERE ##

#create a new DataFrame that is grouped by position
## YOUR CODE HERE ##
