# Exploratory Data Analysis, Sentiment Analysis and Topic Modelling
---

In [None]:
#Importing modules and data
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
import os
import datetime as dt  
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 15
width = 0.75
sns.set_palette(sns.color_palette('tab20', 20))
import plotly.graph_objs as go
from datetime import date, timedelta
from empath import Empath
lexicon = Empath()
import math
from textblob import TextBlob
from detoxify import Detoxify
import chart_studio.plotly as py
from plotly.offline import iplot
import plotly.express as px
import plotly.figure_factory as ff
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): absolute, machine-specific path — point this at a local copy of
# jon_bellion.csv before running the notebook anywhere else.
# Later cells read the 'lyrics', 'processed_lyrics', 'album' and 'titles' columns of this frame.
data = pd.read_csv('/Users/qab/Desktop/Personal/NLP Projects/Context Maturity (NLP)/Data/jon_bellion.csv')

In [None]:
data.head()

## Drawing Empath Themes

In [None]:
#Runs the shared Empath lexicon over one song's lyrics
def extract_empath(lyrics):
    '''Return whatever ``lexicon.analyze`` produces for ``lyrics``.'''
    theme_scores = lexicon.analyze(lyrics)
    return theme_scores

#Keeps only the themes whose score is non-zero
def make_tags(tags):
    '''Return the keys of ``tags`` whose value is not 0, in the mapping's iteration order.'''
    kept = []
    for theme, score in tags.items():
        if score != 0:
            kept.append(theme)
    return kept

#Flattens a value (here: the list of tags) into a punctuation-free string
def process(st):
    '''Return ``str(st)`` with every character of ``string.punctuation`` removed.'''
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return str(st).translate(strip_punctuation)

# lyrics -> Empath scores -> names of non-zero themes -> one punctuation-free string of tags.
# process already returns a str, so no further joining is needed.
data['empath_themes'] = (
    data['lyrics']
    .apply(extract_empath)
    .apply(make_tags)
    .apply(process)
)

In [None]:
data.head(10)

In [None]:
#Number of songs per album
# Counts the non-null 'titles' entries of each album and plots them in descending order.
# NOTE(review): opacity=0 is handed to cufflinks as-is — confirm the bars are not drawn fully transparent.
data.groupby('album').count()['titles'].sort_values(ascending=False).iplot(
    kind='bar', 
    yTitle='Number of songs', 
    linecolor='black', 
    opacity=0,
    title='Bar chart of songs per album release', 
    xTitle='Albums'
    )

## Word count of lyrics before and after text preprocessing.

In [None]:
# Character and word counts of the raw and the pre-processed lyrics
def word_count(x):
    '''Number of whitespace-separated tokens in ``str(x)``.'''
    return len(str(x).split())

# song_length is measured in characters, not words
data['song_length'] = data['lyrics'].astype(str).str.len()
data['lyrics_word_count'] = data['lyrics'].map(word_count)
data['processed_lyrics_word_count'] = data['processed_lyrics'].map(word_count)
data.head()

In [None]:
# Length of all songs per album for unprocessed lyrics
sns.set(rc = {'figure.figsize':(15, 10)})

# (album key as stored in the data, legend label), in plotting order
album_labels = [
    ('translation_through_speakers', 'Translation Through Speakers'),
    ('the_separation', 'The Separation'),
    ('the_definition', 'The Definition'),
    ('the_human_condition', 'The Human Condition'),
    ('glory_sound_prep', 'Glory Sound Prep'),
]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm it exists in the installed version.
for album_key, album_label in album_labels:
    # Character length of every raw lyric that belongs to this album
    lyric_lengths = data.loc[data['album'] == album_key, 'lyrics'].str.len()
    sns.distplot(lyric_lengths, label = album_label)

plt.title('Length of lyrics per album released (Lyrics Not Processed)')
plt.legend();

In [None]:
# Length of all songs per album for processed lyrics
sns.set(rc = {'figure.figsize':(15, 10)})

# (album key as stored in the data, legend label), in plotting order
album_labels = [
    ('translation_through_speakers', 'Translation Through Speakers'),
    ('the_separation', 'The Separation'),
    ('the_definition', 'The Definition'),
    ('the_human_condition', 'The Human Condition'),
    ('glory_sound_prep', 'Glory Sound Prep'),
]

# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm it exists in the installed version.
for album_key, album_label in album_labels:
    # Character length of every pre-processed lyric that belongs to this album
    lyric_lengths = data.loc[data['album'] == album_key, 'processed_lyrics'].str.len()
    sns.distplot(lyric_lengths, label = album_label)

plt.title('Length of lyrics per album released (Pre-Processed Lyrics)')
plt.legend();

In [None]:
#Distribution of songs and their length (song_length is the character count of the raw lyrics)
data['song_length'].iplot(
    kind = 'hist',
    bins = 56, # one bin per row of this small dataset (presumably 56 songs — confirm) so the spread stays even
    xTitle = 'Song Length',
    linecolor = 'black',
    yTitle = 'Number of Songs',
    title = 'Song Length Distribution')

In [None]:
# Word count of lyrics before and after cleaning, both columns drawn on the same histogram
data[['lyrics_word_count', 'processed_lyrics_word_count']].iplot(
    kind = 'hist',
    bins = 20, # fixed at 20 bins here (the other histograms in this notebook use 56)
    xTitle = 'Word Count',
    linecolor = 'black',
    yTitle = 'Number of Songs',
    title = 'Lyrics Count Per Song Distribution Before and After Pre-processing')

## Sentiment, Toxicity and Subjectivity.

In [None]:
#Polarity and subjectivity scorers built on TextBlob
def pol(x):
    '''Sentiment polarity TextBlob reports for the text ``x``.'''
    return TextBlob(x).sentiment.polarity

def sub(x):
    '''Sentiment subjectivity TextBlob reports for the text ``x``.'''
    return TextBlob(x).sentiment.subjectivity

#One new feature per scorer, both computed on the raw lyrics
data['polarity'] = data['lyrics'].map(pol)
data['subjectivity'] = data['lyrics'].map(sub)
data.head()

In [None]:
data.describe()

## Toxicity

In [None]:
# Build the Detoxify 'original' model once and reuse it for every song;
# constructing it inside the loop repeats the model set-up on each iteration.
detoxify_model = Detoxify('original')

# One prediction per lyric, in row order (despite its name this is a list with one entry per song)
toxicity_dict = [detoxify_model.predict(lyric) for lyric in data['lyrics']]

In [None]:
# One row per song, one column per key of the Detoxify predictions.
# The frame reuses data's index so the column-wise concat pairs every score with
# its own song even when data does not carry the default 0..n-1 index.
toxicity_df = pd.DataFrame(toxicity_dict, index = data.index)
# NOTE: re-running this cell appends the score columns a second time.
data = pd.concat([data, toxicity_df], axis = 1)
#data.to_csv('Jon Bellion Discography DS Metadata.csv')
data.head()

## Visualising the sentiment and toxicity of songs

In [None]:
# Sentiment polarity distribution - shows polarity range and number of songs in that range
data['polarity'].iplot(
    kind = 'hist',
    bins = 56, # one bin per row of this small dataset (presumably 56 songs — confirm) so the spread stays even
    xTitle = 'Lyrics Polarity',
    linecolor = 'black',
    yTitle = 'Number of Songs',
    title = 'Sentiment Polarity Distribution')

In [None]:
# Toxicity distribution - shows toxicity range and number of songs in that range
data['toxicity'].iplot(
    kind = 'hist',
    bins = 56, # one bin per row of this small dataset (presumably 56 songs — confirm) so the spread stays even
    xTitle = 'Lyrics Toxicity',
    linecolor = 'black',
    yTitle = 'Number of Songs',
    title = 'Toxicity Distribution')

In [None]:
# A 2D density jointplot comparing sentiment and the toxicity of the songs

# Options shared by all four axes
bare_axis = dict(showgrid=False, zeroline=False)

# Centre panel: one marker per song ...
trace1 = go.Scatter(
    x=data['polarity'],
    y=data['toxicity'],
    mode='markers',
    name='points',
    marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4),
)
# ... drawn together with the density contours of the same two columns
trace2 = go.Histogram2dContour(
    x=data['polarity'],
    y=data['toxicity'],
    name='density',
    ncontours=20,
    colorscale='Hot',
    reversescale=True,
    showscale=False,
)
# Marginal histogram of polarity, attached to the secondary y axis
trace3 = go.Histogram(
    x=data['polarity'],
    name='Sentiment polarity density',
    marker=dict(color='rgb(102,0,0)'),
    yaxis='y2',
)
# Marginal histogram of toxicity, attached to the secondary x axis
trace4 = go.Histogram(
    y=data['toxicity'],
    name='Song Toxicity density',
    marker=dict(color='rgb(102,0,0)'),
    xaxis='x2',
)
plot_data = [trace1, trace2, trace3, trace4]

layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
    # Primary axes span 0-0.85 of the figure, the secondary axes the remaining 0.85-1 strip
    xaxis=dict(domain=[0, 0.85], **bare_axis),
    yaxis=dict(domain=[0, 0.85], **bare_axis),
    xaxis2=dict(domain=[0.85, 1], **bare_axis),
    yaxis2=dict(domain=[0.85, 1], **bare_axis),
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig, filename='2dhistogram-2d-density-plot-subplots')

In [None]:
# Polarity against toxicity, one colour per album, song title shown on hover
fig = px.scatter(
    data,
    x='polarity',
    y='toxicity',
    color='album',
    hover_name='titles',
    labels={
        'polarity': 'Polarity',
        'toxicity': 'Toxicity',
        'album': 'Albums',
    },
    title='Polarity and Toxicity Plot',
    width=1000,
    height=800,
)
fig.show()

## Drill-down analysis of how sentiment changes over the course of each song

In [None]:
# A function to split lyrics into 'n' number of chunks
def split_text(text, n = 5):
    '''Split a string of text (lyrics) into n consecutive, equally sized chunks.

    text: the string to split.
    n: number of chunks to return (default 5).

    Returns a list of n strings. Each chunk holds len(text) // n characters,
    so up to n - 1 trailing characters that do not fill a whole chunk are left
    out. When the text is shorter than n the chunk size is one character and
    the remaining chunks are empty strings.
    '''
    # max(1, ...) keeps the chunk size positive when the text is shorter than n;
    # a chunk size of 0 used to make the start-offset computation fail.
    size = max(1, len(text) // n)

    # Slicing past the end of the text yields '', so short texts need no special case
    return [text[piece * size:(piece + 1) * size] for piece in range(n)]

In [None]:
#Splitting every song's processed lyrics into chunks (default chunk count)
list_pieces = [split_text(lyric) for lyric in data.processed_lyrics]
    
#list_pieces

In [None]:
#Polarity of every chunk: one inner list per song, one value per chunk
polarity_lyrics = [
    [TextBlob(piece).sentiment.polarity for piece in pieces]
    for pieces in list_pieces
]
    
#polarity_lyrics

In [None]:
#Plotting sentiment changes across the chunks of text (lyrics) for the song at position 1
plt.plot(polarity_lyrics[1])
# .iloc[1] is the song's title; .index[1] would only give its row label
plt.title(data['titles'].iloc[1])
plt.show()

In [None]:
# Plotting for all songs
plt.rcParams['figure.figsize'] = [50, 40]

# Keep the original 8 x 7 layout, but add rows when there are more than 56 songs
# so plt.subplot is never asked for a cell outside the grid.
n_cols = 7
n_rows = max(8, math.ceil(len(polarity_lyrics) / n_cols))

# Titles and polarity lists are paired by position, so the row labels of data do not matter
for index, (title, song_polarity) in enumerate(zip(data['titles'], polarity_lyrics)):
    plt.subplot(n_rows, n_cols, index+1)
    plt.plot(song_polarity)
    # Flat reference line at polarity 0 across the five chunk positions
    plt.plot(np.arange(0, 5), np.zeros(5))
    plt.title(title, fontsize = 18)
    plt.ylim(-1, 1)
    plt.xlim(0, 4)

plt.show()

In [None]:
# A boxplot showcasing the sentiment range for the albums

# (album key as stored in the data, display name, box colour), in plotting order
album_styles = [
    ('translation_through_speakers', 'Translation Through Speakers', 'rgb(214, 12, 140)'),
    ('the_separation', 'The Separation', 'rgb(0, 128, 128)'),
    ('the_definition', 'The Definition', 'rgb(10, 140, 208)'),
    ('the_human_condition', 'The Human Condition', 'rgb(12, 102, 14)'),
    ('glory_sound_prep', 'Glory Sound Prep', 'rgb(100, 0, 10)'),
]

# One box per album, built from the polarity values of that album's songs
box_data = [
    go.Box(
        y = data.loc[data['album'] == album_key, 'polarity'],
        name = display_name,
        marker = dict(color = box_color),
    )
    for album_key, display_name, box_color in album_styles
]

layout = go.Layout(
    title = 'Sentiment Polarity Box Plot for Jon Bellion Albums'
)

fig = go.Figure(data = box_data, layout = layout)
iplot(fig, filename = 'Sentiment Polarity Box Plot for Jon Bellion Albums')

In [None]:
# Strip plot of per-song polarity (rounded to 3 decimals) grouped and coloured by album
fig = px.strip(
    data,
    x=data['album'],
    y=data['polarity'].round(3),
    color='album',
    hover_name='titles',
)
fig.show()

In [None]:
# A boxplot showcasing the toxicity range for the albums

# (album key as stored in the data, display name, box colour), in plotting order
album_styles = [
    ('translation_through_speakers', 'Translation Through Speakers', 'rgb(214, 12, 140)'),
    ('the_separation', 'The Separation', 'rgb(0, 128, 128)'),
    ('the_definition', 'The Definition', 'rgb(10, 140, 208)'),
    ('the_human_condition', 'The Human Condition', 'rgb(12, 102, 14)'),
    ('glory_sound_prep', 'Glory Sound Prep', 'rgb(100, 0, 10)'),
]

# One box per album, built from the toxicity values of that album's songs
box_data = [
    go.Box(
        y = data.loc[data['album'] == album_key, 'toxicity'],
        name = display_name,
        marker = dict(color = box_color),
    )
    for album_key, display_name, box_color in album_styles
]

layout = go.Layout(
    title = 'Toxicity Box Plot for Jon Bellion Albums'
)

fig = go.Figure(data = box_data, layout = layout)
iplot(fig, filename = 'Toxicity Box Plot for Jon Bellion Albums')

In [None]:
data.columns

In [None]:
#Getting top words
def get_top_n_words(corpus, n = None):
    '''Return the n most frequent (word, count) pairs in corpus, most frequent first.

    Counts are totals over all documents, using CountVectorizer's default
    settings. With n = None every word of the vocabulary is returned.
    '''
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(corpus)
    totals = term_counts.sum(axis = 0)
    ranked = sorted(
        ((word, totals[0, column]) for word, column in vectorizer.vocabulary_.items()),
        key = lambda pair: pair[1],
        reverse = True,
    )
    return ranked[:n]
common_words = get_top_n_words(data['processed_lyrics'], 20)

# Echo the ranking, then keep it as a two-column frame for plotting
for pair in common_words:
    print(*pair)
df1 = pd.DataFrame(common_words, columns = ['processed_lyrics' , 'count'])

In [None]:
#Bar chart of the top words, most frequent first
word_totals = df1.groupby('processed_lyrics')['count'].sum()
word_totals.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 words in lyrics',
)

In [None]:
#Getting the top n bi-grams
def get_top_n_bigram(corpus, n=None):
    '''Return the n most frequent (bigram, count) pairs in corpus, most frequent first.

    Counts are totals over all documents, using a CountVectorizer restricted
    to two-word sequences. With n = None every bigram is returned.
    '''
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    bigram_counts = vectorizer.fit_transform(corpus)
    totals = bigram_counts.sum(axis=0)
    ranked = sorted(
        ((bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()),
        key = lambda pair: pair[1],
        reverse = True,
    )
    return ranked[:n]

common_words = get_top_n_bigram(data['processed_lyrics'], 20)

# Echo the ranking, then keep it as a two-column frame for plotting
for pair in common_words:
    print(*pair)

df3 = pd.DataFrame(common_words, columns = ['processed_lyrics' , 'count'])

In [None]:
#Bar chart of the top bi-grams, most frequent first
bigram_totals = df3.groupby('processed_lyrics')['count'].sum()
bigram_totals.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in lyrics',
)

## Topic Modelling
___

In [None]:
# Bag-of-words counts over the pre-processed lyrics: English stop words removed,
# vocabulary capped at 40,000 terms.
count_vectorizer = CountVectorizer(stop_words = 'english', max_features = 40000)
lyrics_data = data.processed_lyrics.values

# The song at position 45 is printed before and after vectorization as a spot check.
# NOTE(review): hard-coded position — raises IndexError when there are fewer than 46 songs.
print('LYRICS BEFORE VECTORIZATION: {}'.format(lyrics_data[45]))

document_term_matrix = count_vectorizer.fit_transform(lyrics_data)

print('LYRICS AFTER VECTORIZATION: \n{}'.format(document_term_matrix[45]))

In [None]:
from sklearn.decomposition import TruncatedSVD
num_of_topics = 10

# Fixed seed: TruncatedSVD's default solver is randomized, so without it the topic
# assignments can differ between runs while the topic names further down are written by hand.
# NOTE(review): re-check the hand-written topic names against the topics this seed produces.
lsa_model = TruncatedSVD(n_components = num_of_topics, random_state = 0)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)

In [None]:
def get_keys(topic_matrix):
    '''
    Return, for every row of the topic matrix, the column index holding
    the row's largest value, as a plain Python list of integers
    '''
    strongest_topic_per_row = topic_matrix.argmax(axis=1)
    return strongest_topic_per_row.tolist()

def keys_to_counts(keys):
    '''
    This returns a tuple of topic categories and their 
    accompanying magnitudes for a given list of keys

    keys: iterable of hashable topic indices.
    Returns (categories, counts): two lists of equal length, where counts[i]
    is how often categories[i] occurs in keys. Categories appear in the
    order in which they are first seen.
    '''
    # Imported here so the function does not depend on a later cell having run
    from collections import Counter

    tally = Counter(keys)
    categories = list(tally.keys())
    counts = list(tally.values())
    return (categories, counts)

In [None]:
from collections import Counter
# Strongest topic index for every song, then the distinct topics and the number of songs in each
lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)

In [None]:
def find_top_n_words(n, keys, document_term_matrix, count_vectorizer):
    '''
    This returns a list of n_topic strings, where each string contains the n most common 
    words in a predicted category, in order

    n: number of words to keep per topic.
    keys: predicted topic index for every row of document_term_matrix.
    document_term_matrix: document-by-term count matrix whose rows align with keys.
    count_vectorizer: the fitted vectorizer that produced the matrix; its
        vocabulary_ maps each word to its column index.

    The number of topics is read from the module-level num_of_topics. A topic
    that no document was assigned to contributes an empty string.
    '''
    # Invert the vocabulary once: column index -> word
    index_to_word = {column: word for word, column in count_vectorizer.vocabulary_.items()}
    keys = np.asarray(keys)

    top_words = []
    for topic in range(num_of_topics):
        member_rows = np.flatnonzero(keys == topic)
        if member_rows.size == 0:
            # Nothing to sum for this topic; keep its slot so positions still match topic indices
            top_words.append("")
            continue
        # Total count of every term over the documents assigned to this topic
        topic_totals = np.asarray(document_term_matrix[member_rows].sum(axis=0)).ravel()
        # argsort is ascending, so take the last n columns and reverse them
        top_columns = np.flip(np.argsort(topic_totals)[-n:], 0)
        top_words.append(" ".join(index_to_word[column] for column in top_columns))
    return top_words

In [None]:
top_n_words_lsa = find_top_n_words(10, lsa_keys, document_term_matrix, count_vectorizer)

# Topics are numbered from 1 in this printout
for topic_number, topic_words in enumerate(top_n_words_lsa, start=1):
    print("Topic {}: ".format(topic_number), topic_words)

In [None]:
# Human-readable name for each of the ten topics; position i names topic index i.
# NOTE(review): these names appear to be written by hand from one run's topic words —
# confirm they still describe the topics whenever the model is refitted.
topic_labels = ['Loss and Misery',
'Expressing Love & Desperation',
'Contemplation',
'Comfort & Accomplishments',
'Youthfulness',
'Bonds & Relationships',
'Struggle & Turmoil',
'Nurturing',
'Contentment',
'Dream & Fantasies']

In [None]:
# NOTE(review): top_5_words is computed here but not used in this cell.
top_5_words = find_top_n_words(5, lsa_keys, document_term_matrix, count_vectorizer)
# Tick labels use the zero-based topic index; the printout of the topic words numbers topics from 1.
labels = ['Topic {}: \n'.format(i) + topic_labels[i] for i in lsa_categories]

# Number of songs whose strongest topic is each category
fig, ax = plt.subplots(figsize=(40,20))
ax.bar(lsa_categories, lsa_counts);
ax.set_xticks(lsa_categories);
ax.set_xticklabels(labels);
ax.set_title('LSA TOPIC COUNT');
ax.set_ylabel('NUMBER OF SONGS');

In [None]:
from sklearn.manifold import TSNE
# Two-dimensional t-SNE embedding of the LSA topic vectors; random_state is fixed at 0.
# NOTE(review): perplexity = 50 is close to the dataset size implied by the "56 rows" comments — confirm it stays below the number of songs.
# NOTE(review): newer scikit-learn releases rename n_iter to max_iter — confirm against the installed version.
tsne_lsa_model = TSNE(n_components = 2, perplexity = 50, learning_rate = 100, n_iter = 2000, verbose = 1, random_state = 0, angle = 0.75)
tsne_lsa_vectors = tsne_lsa_model.fit_transform(lsa_topic_matrix) 

In [None]:
def get_mean_topic_vectors(keys, two_dim_vectors):
    '''
    Return one centroid per topic: the mean of the vectors whose key equals
    that topic, for topics 0 .. num_of_topics - 1 (read from module level)
    '''
    centroids = []
    for topic in range(num_of_topics):
        members = [two_dim_vectors[row] for row, key in enumerate(keys) if key == topic]
        # np.vstack raises ValueError when no vector was assigned to this topic
        stacked = np.vstack(members)
        centroids.append(np.mean(stacked, axis=0))
    return centroids

In [None]:
# Palette of 20 hex colours, then cut down to one colour per topic.
# Kept as a NumPy array so it can be indexed with a list of topic indices.
colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5" ])
colormap = colormap[:num_of_topics]

In [None]:
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()

# NOTE(review): top_5_words_lda is not used below, and despite its name it is derived from the LSA keys.
top_5_words_lda = find_top_n_words(5, lsa_keys, document_term_matrix, count_vectorizer)
# Mean t-SNE position of the songs in each topic, used to place the topic names
lsa_mean_topic_vectors = get_mean_topic_vectors(lsa_keys, tsne_lsa_vectors)

# NOTE(review): plot_width / plot_height were replaced by width / height in Bokeh 3 — confirm against the installed version.
plot = figure(title="t-SNE CLUSTERING OF {} LSA LYRICS TOPICS".format(num_of_topics), plot_width=1200, plot_height=500)
# One point per song, coloured by the song's strongest topic
plot.scatter(x=tsne_lsa_vectors[:,0], y=tsne_lsa_vectors[:,1], color=colormap[lsa_keys])

# Write each topic's name at its centroid, in the topic's colour
for t in range(num_of_topics):
    label = Label(x=lsa_mean_topic_vectors[t][0], y=lsa_mean_topic_vectors[t][1], 
                  text=topic_labels[t], text_color=colormap[t])
    plot.add_layout(label)

show(plot)

In [None]:
data['Topics'] = lsa_keys

In [None]:
data.head()

In [None]:
# data.to_csv('final_jon_bellion_data.csv')

## Search for songs based on a given word

In [None]:
import nltk
from nltk.corpus import wordnet
nltk.download('wordnet', quiet = True)
import inflect
def find_synonym(word_entry):
    '''Return the distinct lemma names of every WordNet synset found for word_entry.

    The result is a list without duplicates; its order is not defined.
    '''
    lemma_names = {
        lemma.name()
        for synset in wordnet.synsets(word_entry)
        for lemma in synset.lemmas()
    }
    return list(lemma_names)

def _whole_word_pattern(terms):
    '''Build a regex that matches any of terms as a whole word or phrase, taken literally.'''
    import re  # local import keeps this cell runnable on its own

    alternatives = '|'.join(re.escape(term) for term in terms)
    # Look-arounds instead of surrounding spaces, so hits at the very start or
    # end of the text count and a term never matches inside a longer word.
    return r'(?<!\w)(?:' + alternatives + r')(?!\w)'


def song_search(value, df, toggle):
    '''Return the songs of df that match the query, together with their number.

    value: the query text; an empty or whitespace-only query returns df unchanged.
    df: DataFrame holding 'processed_lyrics' (used by 'Lyrics') and
        'empath_themes' (used by 'Tags').
    toggle: 'Lyrics' matches the WordNet synonyms of every query word against the
        processed lyrics; if no synonym is found it falls back to a literal
        substring search for the query. 'Tags' matches the query words against
        the Empath themes. Any other value matches nothing.

    Matching is case-insensitive, whole-word, and treats the query literally
    (regex metacharacters typed by the user have no special meaning).

    Returns (matching rows, number of matching rows).
    '''
    terms = value.split()
    if not terms:
        return df, df.shape[0]

    # Result for an unrecognised toggle: an empty frame
    df_out = pd.DataFrame({'titles': [], 'album': [], 'date_released': [], 'lyrics': []})

    if toggle == 'Lyrics':
        synonyms = []
        for term in terms:
            synonyms.extend(find_synonym(term))
        if not synonyms:
            matches = df['processed_lyrics'].str.contains(value, case = False, regex = False, na = False)
        else:
            # WordNet writes multi-word lemmas with underscores; lyrics use spaces
            phrases = [synonym.replace('_', ' ') for synonym in synonyms]
            matches = df['processed_lyrics'].str.contains(
                _whole_word_pattern(phrases), regex = True, case = False, na = False)
        df_out = df.loc[matches]
    elif toggle == 'Tags':
        matches = df['empath_themes'].str.contains(
            _whole_word_pattern(terms), regex = True, case = False, na = False)
        df_out = df.loc[matches]

    return df_out, df_out.shape[0]

In [None]:
import ipywidgets as widgets
# Chooses what song_search looks in: the processed lyrics or the Empath tags
search_toggle = widgets.Dropdown(
    options = ['Lyrics', 'Tags'],
    description = 'Search:',
    disabled = False
)

# Free-text query box
# NOTE(review): both widgets carry the same 'Search:' caption.
search = widgets.Text(
    placeholder = 'Type Something',
    description = 'Search:',
    disabled = False
)

# NOTE(review): display is used here before the next cell imports it from IPython.display —
# this relies on the notebook environment providing display implicitly.
display(search_toggle)
display(search)

In [None]:
from IPython.display import HTML, display
# Run the search with the current widget values and show the matching titles and albums
fr_sh, size = song_search(search.value, data, search_toggle.value)
# df holds an HTML string here, not a DataFrame; escape = False leaves the cell text unescaped in the markup
df = fr_sh[['titles', 'album']].to_html(escape = False, index = False)
print('Number of songs about', search.value, 'is: ' + str(size))
display(HTML(df))