# 50 Years of Music Trends

## Objective
* Analyze lyrics from billboard top 100 songs over 50 years to identify trends
* Statement: Has the sentiments of popular lyrics changed over time?

## Hypothesis
* Ha = the sentiments of popular lyrics has become more negative over time
* Ho = no change in the sentiments of popular lyrics over time 

## Sources
* musixmatch source: https://developer.musixmatch.com/documentation/api-reference/track-lyrics-get <br/>
* musixmatch python: https://github.com/hudsonbrendon/python-musixmatch <br/>
* billboard python: https://github.com/guoguo12/billboard-charts<br/>

In [1]:
# Dependency library
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from scipy.stats import linregress

# API Calls
import billboard
from musixmatch import Musixmatch

# API Keys
from musixmatch_api import api_key

# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

In [2]:
# Generate a (pseudo) random list of (almost all) dates in string format to fit musixmatch parameter

date_list = []
min_year = 1968
error_counter = 0

for i in range(50):
    try:
        # RANDOM date generation
        month_rand = str(random.randint(1,12))
        if len(month_rand) ==1:
            # PAD single digit numbers with a leading 0
            month_rand = month_rand.zfill(2)
            
        day_rand = str(random.randint(1,28))
        if len(day_rand) == 1:
            day_rand = day_rand.zfill(2)
        
        # STRINGIFY the result for the musixmatch parameter
        date_rand = str(f'{min_year}-{month_rand}-{day_rand}')
        
        # APPEND result to date_list
        date_list.append(date_rand)
        
        #INCREMENT the year
        min_year = min_year + 1
        
    except ValueError:
        error_counter = error_counter + 1

# VIEW date_list object
date_list

['1968-10-05',
 '1969-01-27',
 '1970-03-26',
 '1971-06-10',
 '1972-04-20',
 '1973-09-21',
 '1974-03-28',
 '1975-11-22',
 '1976-04-01',
 '1977-08-17',
 '1978-05-08',
 '1979-03-22',
 '1980-09-11',
 '1981-11-09',
 '1982-05-22',
 '1983-09-14',
 '1984-11-08',
 '1985-08-28',
 '1986-01-06',
 '1987-11-18',
 '1988-10-07',
 '1989-03-22',
 '1990-11-19',
 '1991-12-08',
 '1992-04-20',
 '1993-07-11',
 '1994-01-09',
 '1995-01-26',
 '1996-02-11',
 '1997-12-07',
 '1998-01-01',
 '1999-01-02',
 '2000-12-24',
 '2001-02-18',
 '2002-06-21',
 '2003-02-09',
 '2004-04-24',
 '2005-03-26',
 '2006-12-02',
 '2007-02-28',
 '2008-10-02',
 '2009-12-09',
 '2010-07-24',
 '2011-10-12',
 '2012-12-02',
 '2013-08-03',
 '2014-07-20',
 '2015-10-26',
 '2016-09-07',
 '2017-02-23']

In [3]:
# Return top 100 billboard songs for each date in random list generated above
# Note: Running this code takes approximately 2 minutes

billboard_list = 'hot-100'
col_names = ['Song','Artist','Date']
chart_df = pd.DataFrame(columns = col_names)

for date in date_list:
    chart = billboard.ChartData(billboard_list,date)
    for row in chart:
        # EMPTY the list placeholder used to create the dataframe
        chart_list = []
        # CLEAN and convert the billboard object
        chart_list.append(str(row).replace("'","",1))
        temp_df = pd.DataFrame(chart_list)
        temp_df = temp_df[0].str.split("' by ",expand=True)
        temp_df = temp_df.rename(columns={0:"Song",1:"Artist"})
        temp_df['Date'] = date
        
        # APPEND the temp_df in the current loop location to the chart_df
        chart_df = chart_df.append(temp_df)

# REMOVE duplicates and RESET index from the resulting dataframe        
chart_df = chart_df.drop_duplicates().reset_index(drop=True)

# VIEW dataframe head
print(len(chart_df))
chart_df.head()

4999


Unnamed: 0,Song,Artist,Date
0,Hey Jude,The Beatles,1968-10-05
1,Harper Valley P.T.A.,Jeannie C. Riley,1968-10-05
2,Fire,The Crazy World Of Arthur Brown,1968-10-05
3,Little Green Apples,O.C. Smith,1968-10-05
4,Girl Watcher,The O'Kaysions,1968-10-05


In [4]:
# SPLIT the date values in the dataframe for plotting and analysis purposes

chart_df['Year'], chart_df['Month'], chart_df['Day'] = chart_df['Date'].str.split('-').str

# VIEW dataframe head
chart_df.head()

Unnamed: 0,Song,Artist,Date,Year,Month,Day
0,Hey Jude,The Beatles,1968-10-05,1968,10,5
1,Harper Valley P.T.A.,Jeannie C. Riley,1968-10-05,1968,10,5
2,Fire,The Crazy World Of Arthur Brown,1968-10-05,1968,10,5
3,Little Green Apples,O.C. Smith,1968-10-05,1968,10,5
4,Girl Watcher,The O'Kaysions,1968-10-05,1968,10,5


In [5]:
# Retrieve lyrics from MusixMatch API based on song and artist in above dataframe
# Running this code takes approximately 5 - 7 minutes

musixmatch = Musixmatch(api_key)
lyrics_list = []
error_counter = 0

# LOOP through the data frame and use song title and artist name to search for lyrics in musixmatch
for x in range(len(chart_df)):
    
    # ERROR HANDLING in case a song queries returns 'null' from musixmatch
    try:
        # GRAB the lyrics based on location (iloc) in chart_df
        song_search = chart_df.iloc[x,0]
        artist_search = chart_df.iloc[x,1]
        lyrics = musixmatch.matcher_lyrics_get(q_artist=artist_search,
                                               q_track=song_search)['message']['body']['lyrics']['lyrics_body']
        
        # FORMATTING to truncate the nonsense at the end of the lyrics from MusixMatch
        song_length = len(lyrics)
        endpoint = len("******* This Lyrics is NOT for Commercial use *******\n(1409617829201)")
        lyrics = lyrics.replace("\n", " ")
        lyrics = str(lyrics[:song_length-endpoint])
        
        # APPEND lyrics to lyrics_list
        lyrics_list.append(lyrics)
        
    except:
        error_counter = error_counter + 1
        lyrics_list.append('MUSIXMATCH_NA')

# CREATE new column in chart_df
chart_df['Lyrics'] = lyrics_list

# VIEW dataframe head
chart_df.head()

Unnamed: 0,Song,Artist,Date,Year,Month,Day,Lyrics
0,Hey Jude,The Beatles,1968-10-05,1968,10,5,MUSIXMATCH_NA
1,Harper Valley P.T.A.,Jeannie C. Riley,1968-10-05,1968,10,5,MUSIXMATCH_NA
2,Fire,The Crazy World Of Arthur Brown,1968-10-05,1968,10,5,MUSIXMATCH_NA
3,Little Green Apples,O.C. Smith,1968-10-05,1968,10,5,MUSIXMATCH_NA
4,Girl Watcher,The O'Kaysions,1968-10-05,1968,10,5,MUSIXMATCH_NA


In [6]:
# REMOVE blanks and errors from the dataframe
clean_chart_df = chart_df[(chart_df['Lyrics'] != "MUSIXMATCH_NA") & (chart_df['Lyrics'] != "")].reset_index(drop=True)
# VIEW dataframe head
clean_chart_df.head()
# clean_chart_df.count()

Unnamed: 0,Song,Artist,Date,Year,Month,Day,Lyrics


In [7]:
# Vader Sentiment Analysis conducted on each song in the dataframe

# INITIALIZE a list to hold the sentiments
lyrics_sentiments = []

# ANALYZE the list
for y in range(len(clean_chart_df)):
    results = analyzer.polarity_scores(clean_chart_df.iloc[y,6])
    compound = results["compound"]
    pos = results["pos"]
    neu = results["neu"]
    neg = results["neg"]
    lyrics_sentiments.append({"Compound": compound,
                              "Positive": pos,
                              "Negative": neg,
                              "Neutral": neu})

# CREATE a dataframe of sentiment analysis that will be appended to the chart_df
lyrics_sentiments_df = pd.DataFrame(lyrics_sentiments)    
    
# APPEND new columns containing the sentiment analysis    
clean_chart_df['Compound'] = lyrics_sentiments_df['Compound']
clean_chart_df['Positive'] = lyrics_sentiments_df['Positive']
clean_chart_df['Negative'] = lyrics_sentiments_df['Negative']
clean_chart_df['Neutral'] = lyrics_sentiments_df['Neutral']

# SAVE to a .csv output
clean_chart_df.to_csv('billboard_analysis.csv')

# VIEW resulting dataframe head
clean_chart_df.head()

KeyError: 'Compound'

In [None]:
# Create the pandas dataframe group in order to calculate averages by year

chart_group = clean_chart_df.groupby(["Year"]).mean()
chart_group_df = pd.DataFrame(chart_group).reset_index(drop=False)
chart_group_df

In [None]:
# TEST HISTOGRAM of negative sentiment results

plt.figure(figsize=(20,20))
plt.subplot(2, 1, 2)
plt.hist(chart_group_df['Negative'], 10, density=True, alpha=0.7, label="population1")
x,labels=plt.xticks()
labels=[0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11]
plt.xticks(x,labels,fontsize=16)
plt.xticks(label=labels)
plt.yticks(fontsize=16)
plt.title("Distribution of Songs by Negative Sentiment",fontsize=20)

plt.savefig("histogram.png")

In [None]:
# Create a local x-axis
x_axis = np.arange(1968,2018)
print(len(x_axis))
print(len(chart_group_df['Negative']))

In [None]:
# Scatterplot of negative sentiment analysis

# Creates the regression line
(slope, intercept, r_value, p_value, std_err) = linregress(x_axis, chart_group_df['Negative'])
fit = slope * x_axis + intercept
r2 = r_value ** 2

# Sets up plot
fig, ax = plt.subplots(figsize=(20,10))
ax.set_xlabel("Years",fontsize=18)
ax.set_ylabel("Sentiment Analysis: Negative",fontsize=18)
ax.tick_params(labelsize=16)
ax.set_title(label="Negative Sentiment over Time",fontsize=24)

# Plots the data
ax.plot(x_axis, chart_group_df['Negative'], marker='o', color=('red'), linewidth=0.5)
ax.plot(x_axis, fit, 'b--')

plt.savefig("negative.png")

plt.show
print(f'r = {r_value}')
print(f'r^2 = {r2}')
print(f'std err = {std_err}')
print(f'p-value = {p_value}')
print('49% of the variation in the dependent variable (negative sentiment score) is accounted for by the variation \
in the independent variable (time in years). We feel comfortable using R^2, because the data is approximately \
normally distributed based on the shape of the histogram (slightly skewed right). Additionally, the p-value \
is 0+ which is < 0.05. \
Conclusion: We reject the null hypothesis (Ho) in favor of the alternative. These two variables are strongly related')

In [None]:
# Scatterplot of positive sentiment analysis

# Creates the regression line
(slope, intercept, r_value, p_value, std_err) = linregress(x_axis, chart_group_df['Positive'])
fit = slope * x_axis + intercept
r2 = r_value ** 2

# Sets up plot
fig, ax = plt.subplots(figsize=(20,10))
ax.set_xlabel("Years",fontsize=18)
ax.set_ylabel("Sentiment Analysis: Positive",fontsize=18)
ax.tick_params(labelsize=16)
ax.set_title(label="Positive Sentiment over Time",fontsize=24)

# Plots the data
ax.plot(x_axis, chart_group_df['Positive'], marker='o', color=('green'), linewidth=0.5)
ax.plot(x_axis, fit, 'b--')

plt.savefig("positive.png")

plt.show
print(f'r = {r_value}')
print(f'r^2 = {r2}')
print(f'std err = {std_err}')
print(f'p-value = {p_value}')

In [None]:
# Scatterplot of neutral sentiment analysis

# Creates the regression line
(slope, intercept, r_value, p_value, std_err) = linregress(x_axis, chart_group_df['Neutral'])
fit = slope * x_axis + intercept
r2 = r_value ** 2

# Sets up plot
fig, ax = plt.subplots(figsize=(20,10))
ax.set_xlabel("Years",fontsize=18)
ax.set_ylabel("Sentiment Analysis: Neutral",fontsize=18)
ax.tick_params(labelsize=16)
ax.set_title(label="Neutral Sentiment over Time",fontsize=24)

# Plots the data
ax.plot(x_axis, chart_group_df['Neutral'], marker='o', color=('blue'), linewidth=0.5)
ax.plot(x_axis, fit, 'b--')

plt.savefig("neutral.png")

plt.show
print(f'r = {r_value}')
print(f'r^2 = {r2}')
print(f'std err = {std_err}')
print(f'p-value = {p_value}')

In [None]:
# Scatterplot of compound sentiment analysis

# Creates the regression line
(slope, intercept, r_value, p_value, std_err) = linregress(x_axis, chart_group_df['Compound'])
fit = slope * x_axis + intercept
r2 = r_value ** 2

# Sets up plot
fig, ax = plt.subplots(figsize=(20,10))
ax.set_xlabel("Years",fontsize=18)
ax.set_ylabel("Sentiment Analysis: Compound",fontsize=18)
ax.tick_params(labelsize=16)
ax.set_title(label="Compound Sentiment over Time",fontsize=24)

# Plots the data
ax.plot(x_axis, chart_group_df['Compound'], marker='x', color=('black'), linewidth=0.5)
ax.plot(x_axis, fit, 'b--')

plt.savefig("compound.png")

plt.show
print(f'r = {r_value}')
print(f'r^2 = {r2}')
print(f'std err = {std_err}')
print(f'p-value = {p_value}')