# Lyrics Analysis
We scraped the AZLyrics website in the previous Jupyter Notebook, and are now going to use the collected data for lyrics analysis.

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
#nltk.download('stopwords')
from nltk.stem.snowball import EnglishStemmer
from nltk.corpus import stopwords
import jellyfish
import tensorflow as tf
import tensorflow_hub as hub
from sklearn.manifold import TSNE

# Apply seaborn's default theme to the figures drawn in this notebook
sns.set()

In [None]:
# Load and merge the CSV files generated during the data collection.
frames = []
for file_number in range(1, 7):
    path = "data/0{}_dataset_lyrics.csv".format(file_number)
    try:
        frames.append(pd.read_csv(path))
    except FileNotFoundError:
        # A missing part is tolerated: the files that do exist are still merged.
        # Any other error (e.g. a malformed CSV) is no longer silently swallowed.
        print("Skipping missing file: {}".format(path))

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
df = pd.concat(frames) if frames else pd.DataFrame()

df = df.rename({"test": "Lyrics"}, axis=1)
df.drop(columns='Url', inplace=True)
df.loc[df['Style'] == 'Rock\\Alternatif', 'Style'] = 'Rock/Alternatif'  # Fixing a small issue with the dataset
df.dropna(inplace=True)

In [None]:
# (rows, columns) of the merged dataset
df.shape

In [None]:
# Preview the first rows of the merged dataset
df.head()

As we can see, we have successfully scraped the lyrics of over one thousand songs. We also extracted interesting metadata that will enhance our analysis:
- Artist
- Year
- Album name
- Style

## 1. How many words?

In [None]:
# Splitting the lyrics into lower-cased arrays of words.
# dtype=str keeps the result a string array even when no word is found; without it
# an empty match list becomes a float array and np.char.lower raises.
# Applying on the 'Lyrics' column directly avoids building a full row per song.
df['words'] = df['Lyrics'].apply(
    lambda lyrics: np.char.lower(np.array(re.findall(r'\w+', lyrics), dtype=str))
)


In [None]:
# Erase the stop words.
# The set must be turned into a sequence before building the array:
# np.array(set(...)) yields a 0-d object array holding the set itself, so
# np.isin would compare every word against the set and never match.
stop_words = np.array(sorted(set(stopwords.words('english'))))
df['words'] = df['words'].apply(
    lambda word_array: word_array[np.isin(word_array, stop_words, invert=True)]
)

In [None]:
# Inspect the per-song word arrays
df['words']

### 1.1. Lyrics length
The first interesting insight that we can get from our dataset is the number of words used.

In [None]:
# Number of words kept for each song
df['n_words'] = df['words'].map(len)

In [None]:
# Mean number of words per year, one line per style.
plt.figure(figsize=(12, 5))
for style in df['Style'].unique():
    words_per_year = df[df['Style'] == style].groupby('Annee')['n_words'].mean()
    # x and y are passed by keyword: recent seaborn releases reject them positionally.
    sns.lineplot(x=words_per_year.index, y=words_per_year.values, label=style)
plt.title('Various styles mean number of words')
plt.ylabel('Mean number of words')
plt.xlabel('Year')
plt.legend()
plt.show()

In [None]:
# Mean number of words per year for each artist, one subplot per style.
styles = df['Style'].unique()
# One row per style instead of a hard-coded 4; squeeze=False keeps the result an
# array even when there is a single style.
fig, ax = plt.subplots(max(len(styles), 1), 1, figsize=(16, 12), squeeze=False)
ax = ax.ravel()
for (i, style) in enumerate(styles):
    for artist in df.loc[df['Style'] == style, 'Artiste'].unique():
        words_per_year = df[df['Artiste'] == artist].groupby('Annee')['n_words'].mean()
        # x and y are passed by keyword: recent seaborn releases reject them positionally.
        sns.lineplot(x=words_per_year.index, y=words_per_year.values, label=artist, ax=ax[i])
    ax[i].set_title(style)
    ax[i].set_xlabel('')
    # plt.legend() only acts on the current axes, so each subplot gets its own legend.
    ax[i].legend()
plt.suptitle('Mean number of words per artist and per style')
plt.show()

As we can see, most of our music styles have a homogeneous number of words, around 300 per song, except for the rap genre, which has nearly double that: around 600 words per song!

Thus, **rap music generally includes more words per song than other music styles**.

Nevertheless, **there is more heterogeneity in the rap style than in other styles**. For example, Eminem produces music with way more lyrics than Post Malone, even though they are both considered rappers.

We should now have a look at the diversity of these words.

### 1.2. Unique words
Now that we know the sizes of lyrics, let's have a look at their diversity!

In [None]:
# Number of distinct words used in each song
df['n_unique_words'] = df['words'].apply(
    lambda word_array: np.unique(word_array).size
)

In [None]:
# Mean number of unique words per year, one line per style.
plt.figure(figsize=(12, 5))
for style in df['Style'].unique():
    unique_words_per_year = df[df['Style'] == style].groupby('Annee')['n_unique_words'].mean()
    # x and y are passed by keyword: recent seaborn releases reject them positionally.
    sns.lineplot(x=unique_words_per_year.index, y=unique_words_per_year.values, label=style)
plt.title('Various styles mean number of unique words')
# The axis shows unique words, not the total word count.
plt.ylabel('Mean number of unique words')
plt.xlabel('Year')
plt.legend()
plt.show()

In [None]:
# Mean number of unique words per year for each artist, one subplot per style.
styles = df['Style'].unique()
# One row per style instead of a hard-coded 4; squeeze=False keeps the result an
# array even when there is a single style.
fig, ax = plt.subplots(max(len(styles), 1), 1, figsize=(16, 12), squeeze=False)
ax = ax.ravel()
for (i, style) in enumerate(styles):
    for artist in df.loc[df['Style'] == style, 'Artiste'].unique():
        unique_words_per_year = df[df['Artiste'] == artist].groupby('Annee')['n_unique_words'].mean()
        # x and y are passed by keyword: recent seaborn releases reject them positionally.
        sns.lineplot(x=unique_words_per_year.index, y=unique_words_per_year.values, label=artist, ax=ax[i])
    ax[i].set_title(style)
    ax[i].set_xlabel('')
    # plt.legend() only acts on the current axes, so each subplot gets its own legend.
    ax[i].legend()
plt.suptitle('Mean number of unique words per artist and per style')
plt.show()

We can see here that, once again, the rap style uses more diverse words than the other music styles. This is no surprise, since it also uses more words (even non-unique) per song, so we are going to look at the uniqueness ratio of the songs.

In [None]:
# Share of distinct words among all words of a song (element-wise column division)
df['unique_words_ratio'] = df['n_unique_words'].div(df['n_words'])

In [None]:
# Mean ratio of unique words per year, one line per style.
plt.figure(figsize=(12, 5))
for style in df['Style'].unique():
    unique_words_per_year = df[df['Style'] == style].groupby('Annee')['unique_words_ratio'].mean()
    # x and y are passed by keyword: recent seaborn releases reject them positionally.
    sns.lineplot(x=unique_words_per_year.index, y=unique_words_per_year.values, label=style)
plt.title('Various styles mean ratio of unique words')
# The axis shows a ratio, not a word count.
plt.ylabel('Mean ratio of unique words')
plt.xlabel('Year')
plt.legend()
plt.show()

This graph is interesting! We can see two insights here:
- Even though rap produces a higher quantity of lyrics, **the ratio of unique words is not largely higher than other genres**.
- There are visible trends in the ratio of unique words:
 - **Metal's diversity of vocabulary has declined since 1995**.
 - **Pop's diversity of vocabulary is rapidly increasing**.

In [None]:
# Mean ratio of unique words per year for each artist, one subplot per style.
styles = df['Style'].unique()
# One row per style instead of a hard-coded 4; squeeze=False keeps the result an
# array even when there is a single style.
fig, ax = plt.subplots(max(len(styles), 1), 1, figsize=(16, 12), squeeze=False)
ax = ax.ravel()
for (i, style) in enumerate(styles):
    for artist in df.loc[df['Style'] == style, 'Artiste'].unique():
        unique_words_per_year = df[df['Artiste'] == artist].groupby('Annee')['unique_words_ratio'].mean()
        # x and y are passed by keyword: recent seaborn releases reject them positionally.
        sns.lineplot(x=unique_words_per_year.index, y=unique_words_per_year.values, label=artist, ax=ax[i])
    ax[i].set_title(style)
    ax[i].set_xlabel('')
    # plt.legend() only acts on the current axes, so each subplot gets its own legend.
    ax[i].legend()
plt.suptitle('Mean ratio of unique words per artist and per style')
plt.show()

As we can see, the rapid increase in the ratio of unique words comes from Billie Eilish's work. Is this a general trend in the pop genre, or is it specific to our dataset? We would need more data to confirm this.

## 2. Which words?
Now that we have analyzed the quantities of words in the various lyrics of our dataset, we are going to have a qualitative approach, and understand which words are being used.

### 2.1. Most commonly used words
First, we are going to have a look at the most commonly used words. However, we want to count the various inflected forms of a word as one single word, not many. We are thus going to stem our words.

In [None]:
# Stemming the words before counting.
# Lyrics repeat the same words many times, so each distinct word is stemmed once
# and the result is reused through a lookup table instead of re-stemming every
# occurrence.
stemmer = EnglishStemmer()
vocabulary = {word for word_array in df['words'] for word in word_array}
stem_of = {word: stemmer.stem(word) for word in vocabulary}
df['stem_words'] = df['words'].apply(
    lambda word_list: [stem_of[word] for word in word_list]
)

In [None]:
# Flatten every song's stems into one long list, then tally the occurrences of each stem
stem_occurrences = []
for song_stems in df['stem_words']:
    stem_occurrences.extend(song_stems)
all_words = pd.DataFrame({'word': stem_occurrences, 'count': 1}).groupby('word').sum()

In [None]:
# Stem plot of the 50 most frequent stems over the whole dataset.
# Sort once, and pass the 'count' column explicitly rather than the whole
# one-column DataFrame as stem heads.
top_words = all_words.sort_values('count', ascending=False)[:50]
plt.figure(figsize=(14, 5))
plt.stem(top_words.index, top_words['count'])
plt.xticks(rotation=90)
plt.title('Most commonly used words')
plt.ylabel('Number of use')
plt.show()

In [None]:
# Stem plot of the 50 most frequent stems, one subplot per style.
styles = df['Style'].unique()
# One row per style instead of a hard-coded 4; squeeze=False keeps the result an
# array even when there is a single style.
fig, ax = plt.subplots(max(len(styles), 1), 1, figsize=(14, 10), squeeze=False)
ax = ax.ravel()
for (i, style) in enumerate(styles):
    # Counting the used words for this style only. A dedicated name is used so the
    # dataset-wide `all_words` table is not overwritten by the last style.
    style_words = pd.DataFrame({'word': [word for row in df.loc[df['Style']==style, 'stem_words'] for word in row], 'count': 1})
    style_words = style_words.groupby('word').sum()
    # Sort once and plot the 'count' column rather than the whole DataFrame.
    top_style_words = style_words.sort_values('count', ascending=False)[:50]
    ax[i].stem(top_style_words.index, top_style_words['count'])
    ax[i].xaxis.set_tick_params(rotation=90)
    ax[i].set_title(style)
    ax[i].set_ylabel('Number of use')
plt.suptitle('Most commonly used word per style')
plt.tight_layout()
plt.show()

### 2.2. Vulgarity

We are now going to have a look at the vulgarity of the lyrics of our dataset.

In order to do so, we are going to use a [ban word list](https://www.freewebheaders.com/bad-words-list-and-page-moderation-words-list-for-facebook/) which is used for Facebook moderation.

In [None]:
# Load the banned-word list into a single column named "nono".
# The first row is discarded (presumably the file's own header line - verify against the CSV).
banned_frame = pd.read_csv("data/ban_word.csv", header=None, names=["nono"])
banned_frame = banned_frame.drop(index=0)
banned_frame = banned_frame.reset_index(drop=True)
ban_words = np.array(banned_frame.values)
print(f'There are {len(ban_words)} banned words in our list.')

In [None]:
# For each song, retain only the words that appear in the banned list
def _keep_banned(tokens):
    is_banned = np.isin(tokens, ban_words)
    return tokens[is_banned]


df['banned_words'] = df['words'].apply(_keep_banned)

In [None]:
# Number of banned words found in each song
df['n_banned_words'] = df['banned_words'].map(len)

In [None]:
# Mean number of banned words per year, one line per style.
plt.figure(figsize=(12, 5))
for style in df['Style'].unique():
    words_per_year = df[df['Style'] == style].groupby('Annee')['n_banned_words'].mean()
    # x and y are passed by keyword: recent seaborn releases reject them positionally.
    sns.lineplot(x=words_per_year.index, y=words_per_year.values, label=style)
plt.title('Mean number of banned words')
plt.ylabel('Mean number of banned words')
plt.xlabel('Year')
plt.legend()
plt.show()

As we can see, the rap music style is, once again, the highest one on the charts, but this is caused by the sheer number of words in its songs. Let's have a look at the ratio!

In [None]:
# Share of banned words among all words of a song (element-wise column division)
df['banned_words_ratio'] = df['n_banned_words'].div(df['n_words'])

In [None]:
# Mean ratio of banned words per year, one line per style.
plt.figure(figsize=(12, 5))
for style in df['Style'].unique():
    words_per_year = df[df['Style'] == style].groupby('Annee')['banned_words_ratio'].mean()
    # x and y are passed by keyword: recent seaborn releases reject them positionally.
    sns.lineplot(x=words_per_year.index, y=words_per_year.values, label=style)
plt.title('Mean ratio of banned words')
plt.ylabel('Mean ratio of banned words')
plt.xlabel('Year')
plt.legend()
plt.show()

This graph allows us to draw some conclusions:
- **Rap style includes more banned words both proportionally and in general**.
- Many genres seem to be reducing their use of banned words through the years.
- The **pop music style uses really few banned words**. This might be because its target audience is wider.

### 2.3. Similarity

Let's have a look at the similarity of lyrics within styles and between styles.

In [None]:
# NOTE(review): disabled experiment. It appears to stack two lower-triangular
# len(df) x len(df) arrays of lyrics so that each cell holds a pair of songs -
# confirm the intent (and the memory cost) before re-enabling.
# lyrics_matrix = np.stack(
#     [
#         np.tril(
#             np.repeat(
#                 np.reshape(df['Lyrics'].values, (len(df), 1)),
#                 len(df),
#                 1
#             )
#         ),
#         np.tril(
#             np.repeat(
#                 np.reshape(df['Lyrics'].values, (1, len(df))),
#                 len(df),
#                 0
#             )
#         )
#     ],
#     axis=2
# )

In [None]:
# NOTE(review): disabled experiment. It would compute jellyfish's Levenshtein
# distance for each pair along the last axis of `lyrics_matrix`, a name that is
# only defined in commented-out code.
# np.apply_along_axis(
#     lambda couple: jellyfish.levenshtein_distance(str(couple[0]), str(couple[1])),
#     axis=-1,
#     arr=lyrics_matrix,
# )

### 2.4. Vocabulary style

We are now going to try and visualize the vocabulary used by the various lyrics we collected.

In [None]:
# Text embedding model: nnlm-en-dim128, served by TensorFlow Hub at
# https://tfhub.dev/google/nnlm-en-dim128/2
# The model weighs more than 450MB, so the first download can be slow (about 25 minutes).
# TensorFlow Hub caches it afterwards, which makes later loads much faster (about 2 seconds).
model_url = "https://tfhub.dev/google/nnlm-en-dim128/2"
embed = hub.load(model_url)

# Each song's lyrics is embedded into a 128-dimensional dense vector (tf.Tensor)
lyrics_batch = df['Lyrics'].values
embeddings = embed(lyrics_batch)

In [None]:
# t-SNE 2 dimensional projection of the embeddings.
# t-SNE is randomly initialised; a fixed random_state makes the projection (and the
# plots interpreted below) reproducible from one run to the next.
projections = TSNE(n_components=2, random_state=0).fit_transform(embeddings)

In [None]:
# 2-D t-SNE projection of every song, coloured by style.
plt.figure(figsize=(12, 8))
sns.scatterplot(x=projections[:, 0], y=projections[:, 1], hue=df['Style'].values)
plt.title('Lyrics projection by style')
# Render explicitly, like the other plotting cells of this notebook.
plt.show()

We can now visualize and interpret the projections of our lyrics.

As we can see, rap lyrics are clearly different from metal lyrics. On the other hand, metal and rock/alternatif lyrics seem really similar. Finally, pop lyrics seem really diversified and do not visibly differ from other music styles.

In [None]:
# 2-D t-SNE projection of the songs, one subplot per style, coloured by artist.
styles = df['Style'].unique()
# Two columns and as many rows as needed, instead of a hard-coded 2x2 grid that
# fails with more than four styles. With four styles this is still 2x2 at 12x12.
n_cols = 2
n_rows = max((len(styles) + n_cols - 1) // n_cols, 1)
fig, ax = plt.subplots(
    n_rows, n_cols, figsize=(12, 6 * n_rows), sharex='all', sharey='all', squeeze=False
)
for (i, style) in enumerate(styles):
    # Plain boolean array: `projections` is a NumPy array, not indexed like df.
    mask = (df['Style'] == style).values
    sns.scatterplot(
        x=projections[mask, 0],
        y=projections[mask, 1],
        hue=df.loc[mask, 'Artiste'].values,
        ax=ax.flat[i]
    )
    ax.flat[i].set_title(style)
# Hide the grid cells left without a style (odd number of styles).
for unused_axis in ax.flat[len(styles):]:
    unused_axis.set_visible(False)
plt.suptitle('Lyrics projection by artist')
plt.tight_layout()
plt.show()

We can dig deeper and visualize artist by artist.

One good application would be to use this dataset to predict the music style or artist from a given set of lyrics.

## 3. Sentiment analysis

# TODO

- Sentiment score