# Exploratory Data Analysis
---

In [None]:
#Importing modules and data
import pandas as pd
import string
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
import os
import datetime as dt 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 15
width = 0.75
from wordcloud import WordCloud
sns.set_palette(sns.color_palette('tab20', 20))
import plotly.graph_objs as go
from datetime import date, timedelta
from empath import Empath
lexicon = Empath()
import math
from textblob import TextBlob

import chart_studio.plotly as py
from plotly.offline import iplot
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# Load the lyrics dataset into the `data` DataFrame that every later cell reads and extends.
# NOTE(review): this is an absolute, machine-specific path; confirm where the CSV lives before running elsewhere.
data = pd.read_csv('/Users/qab/Desktop/Personal/NLP Projects/Context Maturity (NLP)/Data/jon_bellion.csv')

In [None]:
# Preview the first rows of the freshly loaded dataset.
data.head()

## Drawing Empath Themes

In [None]:
#Finds the empath themes in the lyrics
def extract_empath(lyrics):
    """Analyse ``lyrics`` with the module-level Empath ``lexicon``.

    Returns whatever ``lexicon.analyze`` produces; the pipeline below feeds
    that result to ``make_tags``, which treats it as a theme -> score mapping.
    """
    theme_scores = lexicon.analyze(lyrics)
    return theme_scores

#Creates tags with the empath themes based on score
def make_tags(tags):
    """Return the keys of ``tags`` whose score is non-zero, in mapping order.

    Themes with a score of exactly 0 are dropped, which limits the tags kept
    to the themes that actually registered.
    """
    return [theme for theme in tags if tags[theme] != 0]

#Processes the dictionary of tags and keeps the keys
def process(st):
    """Stringify ``st`` and delete every ``string.punctuation`` character.

    When ``st`` is a list of tags, its ``str()`` form loses the brackets,
    quotes and commas, leaving the tag names separated by single spaces.
    """
    # The three-argument form maps each punctuation character to "delete".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return str(st).translate(strip_punctuation)

# Lyrics -> Empath scores -> non-zero theme names -> punctuation-free string.
# The final join receives a string, so it passes each value through unchanged.
data['empath_themes'] = (
    data['lyrics']
    .apply(extract_empath)
    .apply(make_tags)
    .apply(process)
    .apply(''.join)
)

In [None]:
# Inspect the first ten rows, now including the new 'empath_themes' column.
data.head(10)

In [None]:
# Songs per album: count the non-null titles in each album, largest first.
songs_per_album = data.groupby('album')['titles'].count().sort_values(ascending=False)
songs_per_album.iplot(
    kind='bar',
    xTitle='Albums',
    yTitle='Number of songs',
    title='Bar chart of songs per album release',
    linecolor='black',
    opacity=0,
)

## Word count of lyrics before and after text preprocessing.

In [None]:
# Character length and word counts for the raw and processed lyrics
def word_count(x):
    """Count the whitespace-separated tokens in ``x`` (coerced to ``str`` first)."""
    return len(str(x).split())


# Song length is measured in characters of the raw lyrics.
data['song_length'] = data['lyrics'].astype(str).apply(len)
# Adds 'lyrics_word_count' and 'processed_lyrics_word_count'.
for source_column in ('lyrics', 'processed_lyrics'):
    data[f'{source_column}_word_count'] = data[source_column].apply(word_count)
data.head()

In [None]:
# Distribution of raw-lyrics character length, one curve per album
sns.set(rc={'figure.figsize': (15, 10)})

# Album key as stored in the 'album' column -> legend label.
album_labels = {
    'translation_through_speakers': 'Translation Through Speakers',
    'the_separation': 'The Separation',
    'the_definition': 'The Definition',
    'the_human_condition': 'The Human Condition',
    'glory_sound_prep': 'Glory Sound Prep',
}
album1, album2, album3, album4, album5 = (
    data.loc[data['album'] == album_key, 'lyrics'].str.len()
    for album_key in album_labels
)
for album_lengths, legend_label in zip(
    (album1, album2, album3, album4, album5), album_labels.values()
):
    sns.distplot(album_lengths, label=legend_label)

plt.title('Length of lyrics per album released (Lyrics Not Processed)')
plt.legend();

In [None]:
# Distribution of processed-lyrics character length, one curve per album
sns.set(rc={'figure.figsize': (15, 10)})

# Album key as stored in the 'album' column -> legend label.
album_labels = {
    'translation_through_speakers': 'Translation Through Speakers',
    'the_separation': 'The Separation',
    'the_definition': 'The Definition',
    'the_human_condition': 'The Human Condition',
    'glory_sound_prep': 'Glory Sound Prep',
}
album1, album2, album3, album4, album5 = (
    data.loc[data['album'] == album_key, 'processed_lyrics'].str.len()
    for album_key in album_labels
)
for album_lengths, legend_label in zip(
    (album1, album2, album3, album4, album5), album_labels.values()
):
    sns.distplot(album_lengths, label=legend_label)

plt.title('Length of lyrics per album released (Pre-Processed Lyrics)')
plt.legend();

In [None]:
# Histogram of song length (characters of raw lyrics)
song_length_hist = {
    'kind': 'hist',
    # 56 bins: per the original author, chosen to match the row count of this small dataset.
    'bins': 56,
    'xTitle': 'Song Length',
    'yTitle': 'Number of Songs',
    'linecolor': 'black',
    'title': 'Song Length Distribution',
}
data['song_length'].iplot(**song_length_hist)

In [None]:
# Overlaid histograms of per-song word count, before and after cleaning
word_count_hist = {
    'kind': 'hist',
    'bins': 20,
    'xTitle': 'Word Count',
    'yTitle': 'Number of Songs',
    'linecolor': 'black',
    'title': 'Lyrics Count Per Song Distribution Before and After Pre-processing',
}
word_count_columns = ['lyrics_word_count', 'processed_lyrics_word_count']
data[word_count_columns].iplot(**word_count_hist)

## Sentiment and Subjectivity

In [None]:
# TextBlob-based scorers for polarity and subjectivity
def pol(x):
    """Return TextBlob's sentiment polarity for the text ``x``."""
    return TextBlob(x).sentiment.polarity


def sub(x):
    """Return TextBlob's sentiment subjectivity for the text ``x``."""
    return TextBlob(x).sentiment.subjectivity


# Score the raw lyrics and store each result as a new feature column.
for feature_name, scorer in (('polarity', pol), ('subjectivity', sub)):
    data[feature_name] = data['lyrics'].apply(scorer)
data.head()

In [None]:
# Summary statistics for the dataset, including the derived length and sentiment features.
data.describe()

## Visualising the sentiment and subjectivity of songs

In [None]:
# Histogram of sentiment polarity: how many songs fall in each polarity range
polarity_hist = {
    'kind': 'hist',
    # 56 bins: per the original author, chosen to match the row count of this small dataset.
    'bins': 56,
    'xTitle': 'Lyrics Polarity',
    'yTitle': 'Number of Songs',
    'linecolor': 'black',
    'title': 'Sentiment Polarity Distribution',
}
data['polarity'].iplot(**polarity_hist)

In [None]:
# Histogram of subjectivity: how many songs fall in each subjectivity range
subjectivity_hist = {
    'kind': 'hist',
    # 56 bins: per the original author, chosen to match the row count of this small dataset.
    'bins': 56,
    'xTitle': 'Lyrics Subjectivity',
    'yTitle': 'Number of Songs',
    'linecolor': 'black',
    'title': 'Subjectivity Distribution',
}
data['subjectivity'].iplot(**subjectivity_hist)

In [None]:
# Joint plot of sentiment polarity (x) against song length (y): a scatter and
# 2D density contours on the main axes, plus one histogram per variable drawn
# on the secondary axes along the top and right edges.
dark_red = 'rgb(102,0,0)'
x_values = data['polarity']
y_values = data['song_length']

trace1 = go.Scatter(
    x=x_values,
    y=y_values,
    mode='markers',
    name='points',
    marker=dict(color=dark_red, size=2, opacity=0.4),
)
trace2 = go.Histogram2dContour(
    x=x_values,
    y=y_values,
    name='density',
    ncontours=20,
    colorscale='Hot',
    reversescale=True,
    showscale=False,
)
# Histogram of x, drawn against the secondary y-axis (top strip).
trace3 = go.Histogram(
    x=x_values,
    name='Sentiment polarity density',
    marker=dict(color=dark_red),
    yaxis='y2',
)
# Histogram of y, drawn against the secondary x-axis (right strip).
trace4 = go.Histogram(
    y=y_values,
    name='Song Length density',
    marker=dict(color=dark_red),
    xaxis='x2',
)
plot_data = [trace1, trace2, trace3, trace4]

# Main axes take the lower/left 85% of the figure; secondary axes the remaining 15%.
main_axis = dict(domain=[0, 0.85], showgrid=False, zeroline=False)
side_axis = dict(domain=[0.85, 1], showgrid=False, zeroline=False)

layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis=dict(main_axis),
    yaxis=dict(main_axis),
    xaxis2=dict(side_axis),
    yaxis2=dict(side_axis),
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig, filename='2dhistogram-2d-density-plot-subplots')

In [None]:
# Joint plot of sentiment polarity (x) against subjectivity (y): a scatter and
# 2D density contours on the main axes, plus one histogram per variable drawn
# on the secondary axes along the top and right edges.
dark_red = 'rgb(102,0,0)'
x_values = data['polarity']
y_values = data['subjectivity']

trace1 = go.Scatter(
    x=x_values,
    y=y_values,
    mode='markers',
    name='points',
    marker=dict(color=dark_red, size=2, opacity=0.4),
)
trace2 = go.Histogram2dContour(
    x=x_values,
    y=y_values,
    name='density',
    ncontours=20,
    colorscale='Hot',
    reversescale=True,
    showscale=False,
)
# Histogram of x, drawn against the secondary y-axis (top strip).
trace3 = go.Histogram(
    x=x_values,
    name='Sentiment polarity density',
    marker=dict(color=dark_red),
    yaxis='y2',
)
# Histogram of y, drawn against the secondary x-axis (right strip).
trace4 = go.Histogram(
    y=y_values,
    name='Subjectivity density',
    marker=dict(color=dark_red),
    xaxis='x2',
)
plot_data = [trace1, trace2, trace3, trace4]

# Main axes take the lower/left 85% of the figure; secondary axes the remaining 15%.
main_axis = dict(domain=[0, 0.85], showgrid=False, zeroline=False)
side_axis = dict(domain=[0.85, 1], showgrid=False, zeroline=False)

layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    xaxis=dict(main_axis),
    yaxis=dict(main_axis),
    xaxis2=dict(side_axis),
    yaxis2=dict(side_axis),
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
)

fig = go.Figure(data=plot_data, layout=layout)
iplot(fig, filename='2dhistogram-2d-density-plot-subplots')

In [None]:
#Visualising polarity against subjectivity, with every point labelled by song title
plt.rcParams['figure.figsize'] = [15, 10]

# One scatter call draws every song; the loop below only adds the labels.
plt.scatter(data['polarity'], data['subjectivity'], color='blue')

# Walk the three columns in lockstep so each label always belongs to the row
# being annotated. The previous loop looked rows up by index label but used a
# positional counter as a label for the title, which only agrees when the
# index is the default 0..n-1 range.
for x, y, song_title in zip(data['polarity'], data['subjectivity'], data['titles']):
    # Offset the text slightly so it does not sit on top of its marker.
    plt.text(x + .005, y + .005, song_title, fontsize=10)

# Fixed limits, set once: polarity is shown on [-1, 1], subjectivity on [0, 1].
plt.xlim(-1, 1)
plt.ylim(0, 1)

plt.title('Sentiment and Subjectivity Plot', fontsize=25)
plt.xlabel('<-- Negative -------- Positive -->', fontsize=15)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=15)

plt.show()

## Drill down analysis of sentiment throughout the progression of the song

In [None]:
# A function to split lyrics into 'n' number of chunks
def split_text(text, n = 5):
    '''Split ``text`` into ``n`` consecutive chunks of near-equal length.

    Parameters
    ----------
    text : str
        The text (lyrics) to split.
    n : int, default 5
        Number of chunks to return; must be a positive integer.

    Returns
    -------
    list of str
        Exactly ``n`` chunks which, concatenated in order, reproduce ``text``.
        Chunk lengths differ by at most one character. When ``text`` has fewer
        than ``n`` characters (including the empty string) some chunks are
        empty strings.
    '''
    length = len(text)
    # Chunk i spans [i * length // n, (i + 1) * length // n). Integer
    # boundaries cover the whole text, so no trailing characters are dropped,
    # and a chunk size of zero is harmless for texts shorter than n.
    bounds = [piece * length // n for piece in range(n + 1)]
    return [text[start:stop] for start, stop in zip(bounds, bounds[1:])]

In [None]:
# Split every song's processed lyrics into chunks (default chunk count)
list_pieces = [split_text(song_lyrics) for song_lyrics in data.processed_lyrics]
    
#list_pieces

In [None]:
# Polarity of each lyric chunk: one inner list per song, one score per chunk
polarity_lyrics = [
    [TextBlob(chunk).sentiment.polarity for chunk in song_chunks]
    for song_chunks in list_pieces
]
    
#polarity_lyrics

In [None]:
#Plotting sentiment changes across the lyric chunks of one song (positional row 1)
plt.plot(polarity_lyrics[1])
# Title the plot with the song's name. `.index[1]` returned the row label
# rather than the title, so `.iloc[1]` reads the value at position 1 instead.
plt.title(data['titles'].iloc[1])
plt.show()

In [None]:
# Plotting the chunk-by-chunk polarity for all songs, one small subplot per song
plt.rcParams['figure.figsize'] = [50, 40]

# Seven plots per row. The row count is derived from the data instead of
# assuming exactly 56 songs; for 56 rows this is still an 8 x 7 grid.
n_cols = 7
n_rows = max(1, math.ceil(len(data) / n_cols))

# Iterate the titles positionally so each one lines up with polarity_lyrics,
# which was built in row order, whatever the DataFrame's index labels are.
for position, song_title in enumerate(data['titles']):
    chunk_polarities = polarity_lyrics[position]
    n_chunks = len(chunk_polarities)

    plt.subplot(n_rows, n_cols, position + 1)
    plt.plot(chunk_polarities)
    # Flat reference line at zero polarity spanning every chunk.
    plt.plot(np.arange(n_chunks), np.zeros(n_chunks))
    plt.title(song_title, fontsize=18)
    plt.ylim(-1, 1)
    plt.xlim(0, n_chunks - 1)

plt.show()

In [None]:
# List the column names currently present in the DataFrame.
data.columns

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Most frequent single words across the corpus
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent words in ``corpus`` as (word, count) pairs.

    Counts come from a default ``CountVectorizer`` and are summed over all
    documents. Pairs are sorted by count, highest first; ``n=None`` returns
    every word in the vocabulary.
    """
    vectorizer = CountVectorizer()
    doc_term_counts = vectorizer.fit_transform(corpus)
    # Column totals: one corpus-wide count per vocabulary entry.
    totals = doc_term_counts.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 words in the processed lyrics: tabulate them, then print each pair.
common_words = get_top_n_words(data['processed_lyrics'], n=20)
df1 = pd.DataFrame(common_words, columns=['processed_lyrics', 'count'])
for word, freq in common_words:
    print(word, freq)

In [None]:
# Bar chart of the top words, most frequent first
top_word_counts = df1.groupby('processed_lyrics')['count'].sum().sort_values(ascending=False)
top_word_counts.iplot(
    kind='bar',
    yTitle='Count',
    title='Top 20 words in lyrics',
    linecolor='black',
)

In [None]:
# Most frequent bigrams across the corpus
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams in ``corpus`` as (bigram, count) pairs.

    Counts come from a ``CountVectorizer`` restricted to two-word n-grams and
    are summed over all documents. Pairs are sorted by count, highest first;
    ``n=None`` returns every bigram in the vocabulary.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    doc_term_counts = vectorizer.fit_transform(corpus)
    # Column totals: one corpus-wide count per vocabulary entry.
    totals = doc_term_counts.sum(axis=0)
    ranked = sorted(
        ((term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Top 20 bigrams in the processed lyrics: tabulate them, then print each pair.
common_words = get_top_n_bigram(data['processed_lyrics'], n=20)
df3 = pd.DataFrame(common_words, columns=['processed_lyrics', 'count'])
for word, freq in common_words:
    print(word, freq)

In [None]:
# Bar chart of the top bigrams, most frequent first
top_bigram_counts = df3.groupby('processed_lyrics')['count'].sum().sort_values(ascending=False)
top_bigram_counts.iplot(
    kind='bar',
    yTitle='Count',
    title='Top 20 bigrams in lyrics',
    linecolor='black',
)

In [None]:
# Box plot of per-song sentiment polarity, one box per album
# (album key in the 'album' column, display name, marker colour)
box_specs = [
    ('translation_through_speakers', 'Translation Through Speakers', 'rgb(214, 12, 140)'),
    ('the_separation', 'The Separation', 'rgb(0, 128, 128)'),
    ('the_definition', 'The Definition', 'rgb(10, 140, 208)'),
    ('the_human_condition', 'The Human Condition', 'rgb(12, 102, 14)'),
    ('glory_sound_prep', 'Glory Sound Prep', 'rgb(100, 0, 10)'),
]

# Polarity values of the songs belonging to each album.
y0, y1, y2, y3, y4 = (
    data.loc[data['album'] == album_key, 'polarity']
    for album_key, _, _ in box_specs
)

trace0, trace1, trace2, trace3, trace4 = (
    go.Box(y=polarities, name=display_name, marker=dict(color=colour))
    for polarities, (_, display_name, colour) in zip((y0, y1, y2, y3, y4), box_specs)
)

box_data = [trace0, trace1, trace2, trace3, trace4]

layout = go.Layout(title='Sentiment Polarity Box Plot for Jon Bellion Albums')

fig = go.Figure(data=box_data, layout=layout)
iplot(fig, filename='Sentiment Polarity Box Plot for Jon Bellion Albums')