# Exploratory Data Analysis
### Kwame V. Taylor

## Set up Environment

In [1]:
import pandas as pd
import numpy as np

from math import sqrt
from scipy import stats

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import matplotlib as mpl
import seaborn as sns
from cycler import cycler

# default viz size settings
sns.set(rc={'figure.figsize':(12, 9)})
sns.set_context("talk", rc={"font.size":14,"axes.titlesize":16,"axes.labelsize":12}) 
plt.rc('figure', figsize=(12, 9))
plt.rc('font', size=14)
mpl.rcParams['font.size'] = 14
mpl.rcParams['figure.figsize'] = 12, 9
mpl.rcParams['lines.linewidth'] = 2
mpl.rcParams['lines.linestyle'] = '--'
mpl.rcParams['axes.prop_cycle'] = cycler(color=['deepskyblue', 'firebrick', 'darkseagreen', 'violet'])

In [2]:
from prepare import handle_nulls, fix_tempo
from preprocessing import spotify_split, scale_data

## Acquire data

In [3]:
df = pd.read_csv('full-playlist.csv', index_col=0)

In [4]:
df.head()

Unnamed: 0_level_0,artist,album,release_date,track_name,album_popularity,label,danceability,energy,key,loudness,...,disc_number,track_number,album_id,album_type,duration_seconds,duration_minutes,is_featured_artist,release_year,release_month,release_day
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6mecZbKK3JDeMdFRNxsCV5,tay-k,trapman,2020-07-12,trapman,36,Tay-K,0.792,0.594,2,-8.544,...,1,1,2J1hMj78HfdcMrmL2Sk6eR,single,232,3,0,2020,7,12
5PtMwNq8Dp31uYdGGacVJE,lil wyte,doubt me now,2003-03-04,oxy cotton,55,Hypnotize Minds Productions,0.816,0.578,9,-6.912,...,1,11,2lwxcemR1muymEHNMblCpm,album,193,3,0,2003,3,4
6s8EhlBn2PIoESylkXnwYc,kamelen,kingpin slim,2019-11-29,kingpin o.g - remix,46,NMG/G-HUSET,0.649,0.798,0,-6.45,...,1,11,6va2RTYO2ois7t88RN0LhJ,album,254,4,0,2019,11,29
2e9EZ2V5QGGZPMJacO3y0Y,waka flocka flame,flockaveli,2010-10-01,grove st. party (feat. kebo gotti),71,Asylum/Warner Records,0.705,0.702,0,-4.783,...,1,9,6MQtWELG7aRX7CkAzQ6nLM,album,250,4,1,2010,10,1
3ZRd5Z0fiYtASLdEPPb16m,project pat,mista don't play: everythangs workin',2001-02-13,don't save her (feat. crunchy black),55,Hypnotize Minds Productions,0.838,0.793,11,-5.47,...,1,5,4QzaueQPQa0lqrMmQoh4v0,album,261,4,1,2001,2,13


In [5]:
df.shape

(5733, 30)

In [6]:
df.columns

Index(['artist', 'album', 'release_date', 'track_name', 'album_popularity',
       'label', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_ms', 'time_signature', 'explicit', 'popularity',
       'disc_number', 'track_number', 'album_id', 'album_type',
       'duration_seconds', 'duration_minutes', 'is_featured_artist',
       'release_year', 'release_month', 'release_day'],
      dtype='object')

## Prepare data

In [7]:
# create 3 popularity bins for use in exploration
bin_labels_3 = ['low', 'moderate', 'high']
df['popularity_bin'] = pd.qcut(df['popularity'], q=3, precision=0, labels=bin_labels_3)
df.head()

Unnamed: 0_level_0,artist,album,release_date,track_name,album_popularity,label,danceability,energy,key,loudness,...,track_number,album_id,album_type,duration_seconds,duration_minutes,is_featured_artist,release_year,release_month,release_day,popularity_bin
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6mecZbKK3JDeMdFRNxsCV5,tay-k,trapman,2020-07-12,trapman,36,Tay-K,0.792,0.594,2,-8.544,...,1,2J1hMj78HfdcMrmL2Sk6eR,single,232,3,0,2020,7,12,moderate
5PtMwNq8Dp31uYdGGacVJE,lil wyte,doubt me now,2003-03-04,oxy cotton,55,Hypnotize Minds Productions,0.816,0.578,9,-6.912,...,11,2lwxcemR1muymEHNMblCpm,album,193,3,0,2003,3,4,high
6s8EhlBn2PIoESylkXnwYc,kamelen,kingpin slim,2019-11-29,kingpin o.g - remix,46,NMG/G-HUSET,0.649,0.798,0,-6.45,...,11,6va2RTYO2ois7t88RN0LhJ,album,254,4,0,2019,11,29,low
2e9EZ2V5QGGZPMJacO3y0Y,waka flocka flame,flockaveli,2010-10-01,grove st. party (feat. kebo gotti),71,Asylum/Warner Records,0.705,0.702,0,-4.783,...,9,6MQtWELG7aRX7CkAzQ6nLM,album,250,4,1,2010,10,1,high
3ZRd5Z0fiYtASLdEPPb16m,project pat,mista don't play: everythangs workin',2001-02-13,don't save her (feat. crunchy black),55,Hypnotize Minds Productions,0.838,0.793,11,-5.47,...,5,4QzaueQPQa0lqrMmQoh4v0,album,261,4,1,2001,2,13,moderate


In [8]:
# rearrange columns for focused and easy viewing and exploration

# drop for now: 'album_id', 'album_popularity', 'release_year', 'release_month',
# 'release_day', 'duration_ms'

df = df[['popularity', 'popularity_bin', 'artist', 'album', 'track_name',
       'label', 'danceability', 'energy', 'key', 'loudness', 'mode', 'is_featured_artist',
       'speechiness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'explicit',
       'disc_number', 'duration_seconds', 'duration_minutes', 'track_number', 'album_type', 'release_date']]
df

Unnamed: 0_level_0,popularity,popularity_bin,artist,album,track_name,label,danceability,energy,key,loudness,...,valence,tempo,time_signature,explicit,disc_number,duration_seconds,duration_minutes,track_number,album_type,release_date
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6mecZbKK3JDeMdFRNxsCV5,43,moderate,tay-k,trapman,trapman,Tay-K,0.792,0.594,2,-8.544,...,0.351,82.512,4,1,1,232,3,1,single,2020-07-12
5PtMwNq8Dp31uYdGGacVJE,61,high,lil wyte,doubt me now,oxy cotton,Hypnotize Minds Productions,0.816,0.578,9,-6.912,...,0.265,148.077,4,1,1,193,3,11,album,2003-03-04
6s8EhlBn2PIoESylkXnwYc,23,low,kamelen,kingpin slim,kingpin o.g - remix,NMG/G-HUSET,0.649,0.798,0,-6.450,...,0.717,160.011,4,1,1,254,4,11,album,2019-11-29
2e9EZ2V5QGGZPMJacO3y0Y,62,high,waka flocka flame,flockaveli,grove st. party (feat. kebo gotti),Asylum/Warner Records,0.705,0.702,0,-4.783,...,0.771,140.059,4,1,1,250,4,9,album,2010-10-01
3ZRd5Z0fiYtASLdEPPb16m,45,moderate,project pat,mista don't play: everythangs workin',don't save her (feat. crunchy black),Hypnotize Minds Productions,0.838,0.793,11,-5.470,...,0.800,160.003,4,1,1,261,4,5,album,2001-02-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5G9IXRzPaRJssJE76TDenY,33,moderate,tkay maidza,shook,shook,4AD,0.895,0.791,4,-4.581,...,0.607,106.048,4,1,1,162,2,1,single,2020-05-12
0g3zIOFTdkwgJdR48usYgY,61,high,beyoncé,the lion king: the gift [deluxe edition],my power,Parkwood Entertainment/Columbia,0.769,0.856,7,-4.923,...,0.755,126.950,4,0,1,260,4,12,album,2020-07-31
75Z53FMCvlupjHfAh9XgDo,52,high,shenseea,the sidechick song,the sidechick song,Attomatic Records - Romeich Major,0.736,0.615,8,-4.716,...,0.495,185.998,4,1,1,193,3,1,single,2020-03-09
7pXg4n5cOsZvMPsEtemNgz,48,moderate,sally sossa,100 flows,100 flows,LISTEN TO THE KIDS,0.804,0.679,2,-5.910,...,0.400,157.737,3,1,1,162,2,1,single,2020-10-20


In [9]:
# handle null values
df = handle_nulls(df)

# fix tempo
df = fix_tempo(df)

# split the data
X_train, y_train, X_validate, y_validate, X_test, y_test, train, validate, test = spotify_split(df, 'popularity')
train.head()

Shape of train: (4012, 24) | Shape of validate: (861, 24) | Shape of test: (860, 24)
Percent train: 70.0        | Percent validate: 15.0       | Percent test: 15.0


Unnamed: 0_level_0,popularity,popularity_bin,artist,album,track_name,label,danceability,energy,key,loudness,...,valence,tempo,time_signature,explicit,disc_number,duration_seconds,duration_minutes,track_number,album_type,release_date
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
30bqVoKjX479ab90a8Pafp,87,high,lil peep,star shopping,star shopping,Lil Peep,0.585,0.471,4,-9.934,...,0.323,93.099,4,1,1,142,2,1,single,2019-04-19
0HO8pCseEpgozNi3z0R4bc,24,low,father,who's gonna get fucked first?,everybody in the club gettin shot,Awful Records,0.833,0.518,10,-10.126,...,0.773,90.004,4,1,1,120,2,11,album,2015-03-17
643K3eEgRvdJiXjSzlz7dg,30,moderate,m.o.p.,first family 4 life,breakin' the rules,Relativity Records,0.471,0.671,1,-6.05,...,0.85,88.4315,4,1,1,252,4,2,album,1998-08-11
08EabdvPIHC4KBW7LfynPP,26,low,salt-n-pepa,very necessary,no one does it better,Mercury Records,0.874,0.507,6,-10.076,...,0.706,103.067,4,0,1,234,3,2,album,1993-10-12
6B2ZDVgJ0lHLgV9xC2PAJh,23,low,dungeon family,even in darkness,white gutz (feat. bubba sparxxx),Arista,0.857,0.808,6,-5.468,...,0.825,103.61,4,1,1,266,4,9,album,2001-10-22


In [10]:
train.columns

Index(['popularity', 'popularity_bin', 'artist', 'album', 'track_name',
       'label', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'is_featured_artist', 'speechiness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'explicit', 'disc_number',
       'duration_seconds', 'duration_minutes', 'track_number', 'album_type',
       'release_date'],
      dtype='object')

# Does a track being explicit or not correlate with its popularity?

### Swarm plot

In [None]:
sns.catplot(x="explicit", y="popularity", kind="swarm", data=train, height=8, aspect=1)

### Independent T-Test

The two samples are independent because no observation belongs to both the explicit group and the non-explicit group.

In [None]:
# check for normal distribution
sns.distplot(train.popularity)

In [None]:
train.explicit.value_counts()

In [None]:
# compare variances
# popularity scores split into the explicit and the non-explicit group
explicit_sample = train.loc[train.explicit == True, 'popularity']
not_explicit_sample = train.loc[train.explicit == False, 'popularity']

# explicit variance first, then non-explicit
for sample in (explicit_sample, not_explicit_sample):
    print(sample.var())

The variances are not equal, so we will set the `equal_var` argument to False.

**Set hypothesis and alpha:**

$H_{0}$: Mean of song popularity of explicit tracks = Mean of song popularity of non-explicit tracks

$H_{a}$: Mean of song popularity of explicit tracks > Mean of song popularity of non-explicit tracks

$\alpha$: .05

In [None]:
alpha = .05

**Compute test statistic and probability (t-statistic & p-value)**

In [None]:
# equal_var=False because the two samples' variances were judged unequal above
t, p = stats.ttest_ind(explicit_sample, not_explicit_sample, equal_var=False)

# ttest_ind reports a two-tailed p-value by default; halve it for the one-tailed H_a
print('Test statistic:', t, '\np-value:', p/2, '\nalpha:', alpha)

**Decide**

In [None]:
null_hypothesis = "there is no significant difference between the mean popularity of explicit tracks and non-explicit tracks."

# One-tailed test (H_a: explicit mean > non-explicit mean): the halved p-value
# only supports H_a when the t-statistic is positive, so check the sign as well.
if t > 0 and p/2 < alpha:
    print("We reject the hypothesis that", null_hypothesis)
else:
    print("We fail to reject the null hypothesis.")

In [None]:
not_explicit_sample.mean(), explicit_sample.mean()

Now put it in a function:

In [None]:
def explicit_viz(df):
    '''
    Draw a swarm plot of track popularity for explicit versus non-explicit tracks.

    Prints the question being explored, then plots the 'popularity' column of df
    against its 'explicit' column.
    '''
    question = 'Does a track being explicit or not correlate with its popularity?'
    print(question)
    swarm_settings = {'x': "explicit", 'y': "popularity", 'kind': "swarm"}
    sns.catplot(data=df, **swarm_settings)

In [None]:
def explicit_ttest(df, alpha):
    '''
    Run and narrate a one-tailed independent t-test on the popularity of explicit
    versus non-explicit tracks.

    H0: mean popularity of explicit tracks = mean popularity of non-explicit tracks
    Ha: mean popularity of explicit tracks > mean popularity of non-explicit tracks

    Each step is printed: the alpha level, a distribution plot of popularity, the
    value counts of 'explicit', both sample variances, the t-statistic with its
    one-tailed p-value, the decision, and both sample means.

    Parameters
    ----------
    df : DataFrame with a 'popularity' column and an 'explicit' column
    alpha : significance level, used for both Levene's test and the t-test

    Returns
    -------
    None; all results are printed or plotted.
    '''
    print('Set the alpha/significance level:')
    print('  alpha =', alpha)

    print('\n---\n')

    print('Check for normal distribution:')
    sns.distplot(df.popularity)
    plt.show()

    print('---\n')

    print('Check values counts:')
    print(df.explicit.value_counts())

    print('\n---\n')

    print('Compare variances:')
    explicit_sample = df[df.explicit==True].popularity
    not_explicit_sample = df[df.explicit==False].popularity

    print(explicit_sample.var())
    print(not_explicit_sample.var())

    # Levene's test has the null hypothesis that both samples share one variance;
    # failing to reject it lets the t-test pool the variances (equal_var=True).
    _, levene_p = stats.levene(explicit_sample, not_explicit_sample)
    equal_var = bool(levene_p >= alpha)
    print("Levene's test p-value:", levene_p)

    if equal_var:
        print("They are of similar variance, so we will set the argument of equal_var to True.")
    else:
        print("They are not of similar variance, so we will set the argument of equal_var to False.")

    print('\n---\n')

    print("Compute test statistic and probability (t-statistic & p-value)")
    t, p = stats.ttest_ind(explicit_sample, not_explicit_sample, equal_var=equal_var)
    # ttest_ind reports a two-tailed p-value by default; halve it for the one-tailed Ha
    print('Test statistic:', t, '\np-value:', p/2, '\nalpha:', alpha)

    print('\n---\n')

    null_hypothesis = "there is no significant difference between the mean popularity of explicit tracks and non-explicit tracks."
    # the halved p-value only supports Ha (explicit > non-explicit) when t is positive
    if t > 0 and p/2 < alpha:
        print("We reject the hypothesis that", null_hypothesis)
    else:
        print("We fail to reject the null hypothesis.")

    print('\n---\n')

    print('mean of non-explicit songs:', not_explicit_sample.mean(), '\nmean of explicit songs:', explicit_sample.mean())

In [None]:
explicit_ttest(train, alpha)

# Does loudness have a relationship with popularity?

In [None]:
plt.figure(figsize=(10,5))
plt.title('Popularity & Loudness')
plt.ylabel('Loudness')
plt.xlabel('Popularity')
# pass x and y by keyword: newer seaborn releases no longer accept them positionally
sns.scatterplot(x=train['popularity'], y=train['loudness'])
plt.show()

There don't appear to be any valuable insights here.

# Are originals or remixes more popular?

In [None]:
train[train.track_name.str.contains('remix', case=True, flags=0, regex=False)]

In [None]:
train[train.track_name.str.contains('pop, lock', case=True, flags=0, regex=False)]

In [None]:
train.columns

There are not enough remixes and originals to compare.

# What are the drivers of popularity?

In [None]:
def corr_heatmap(train):
    '''
    Draw a lower-triangle heatmap of the pairwise correlations between the numeric
    and boolean columns of a DataFrame.

    Parameters
    ----------
    train : DataFrame; columns that are neither numeric nor boolean are ignored

    Returns
    -------
    The matplotlib Axes holding the heatmap.
    '''
    # Keep numeric/boolean columns explicitly instead of relying on corr() to
    # drop text and categorical columns, which newer pandas versions refuse to do.
    heatmap_data = train.select_dtypes(include=['number', 'bool'])
    corr = heatmap_data.corr()
    # hide the upper triangle and the diagonal; they only mirror the lower triangle
    mask = np.triu(np.ones_like(corr, dtype=bool))
    # correlations range from -1 to 1, so the colour scale must cover negatives too
    ax = sns.heatmap(corr, mask=mask, center=0, vmin=-1, vmax=1, cmap=sns.diverging_palette(95, 220, n=250, s=93, l=35), square=True)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, horizontalalignment='right')
    plt.title('Which features have significant correlation?')
    return ax

In [None]:
corr_heatmap(train.drop(columns=['artist', 'album', 'track_name', 'release_date']))

Looks like the top drivers of popularity across all of the tracks are danceability, instrumentalness, and explicit.

# Do instrumentals/lyrics have an effect on popularity?

In [None]:
train.instrumentalness.hist()

In [None]:
train.instrumentalness.value_counts()

In [None]:
train.instrumentalness.max()

In [None]:
train['instrumentalness_boolean'] = train["instrumentalness"] > 0.0
train.head()

In [None]:
sns.catplot(x="instrumentalness_boolean", y="popularity", hue="explicit", kind="swarm", data=train, height=8, aspect=1)

There doesn't appear to be a significant difference in popularity between songs with an instrumentalness of exactly 0.0 and songs with an instrumentalness above 0.0.

# Further feature engineering and exploration

In [None]:
train.head()

Below are the top ten record labels with the highest average song popularity across the roughly 6,000 tracks we have in the dataset right now. Of course, some of these labels may have a small number of observations in them. Matt has done further exploration with the labels in his notebook.

In [None]:
pd.DataFrame(train.groupby('label')['popularity'].mean().sort_values(ascending=False).head(10))

Will eventually need to use regex to separate out each of the labels (delimited by a ```/```).

In [None]:
# Grouped violinplot
sns.violinplot(x='popularity_bin', y="danceability", hue="mode", data=train, palette="Pastel1")
plt.show()

In [None]:
sns.lineplot(data=train, x="duration_minutes", y="popularity", hue="is_featured_artist", err_style="bars")
plt.title(label="How many minutes long are songs that are popular, grouped by songs with featured artists and without?", size=20)

Takeaways: 

## Creating is_top_billboard_label

Def Jam, Young Money, Roc-A-Fella, Jive, Bad Boy, Grand Hustle, Shady, Ruffhouse, Cash Money, Columbia

Reference: https://pudding.cool/2017/03/labels/

In [None]:
top_ten_billboard = ['Def Jam', 'Young Money', 'Roc-A-Fella', 'Jive', 'Bad Boy', 'Grand Hustle', 'Shady', 'Ruffhouse', 'Cash Money', 'Columbia']
print(top_ten_billboard)
print()

# regex alternation: a label matches when it contains any one of the ten names
pattern = '|'.join(top_ten_billboard)

# na=False counts a missing label as "not a top label"; without it str.contains
# yields NaN for missing values and the integer cast below would raise
train['is_top_billboard_label'] = train.label.str.contains(pattern, na=False).astype('int')

print(train.is_top_billboard_label.value_counts())

In [None]:
plt.figure(figsize=(12,10))
sns.barplot(x=train.is_top_billboard_label, y=train.popularity)
plt.title('Avg popularity of tracks produced by top ten hip hop labels (per Billboard performance) vs total avg', fontsize=20)

# Looking at musical profiles:

In [None]:
train.popularity_bin.value_counts()

In [None]:
train.tempo.hist(bins=30)
x_ticks = np.arange(0, 230, 10)
plt.xticks(x_ticks)
plt.show()

In [None]:
# put tempo in second position
second_col = train.pop("tempo")
train.insert(1, "tempo", second_col)
train.drop(train[train.tempo <= 170].index).sort_values(by="tempo", ascending=False)

In [None]:
# manually bin tempo with domain knowledge
# pd.cut applies the fixed BPM edges listed below. pd.qcut would instead build
# five equal-sized quantile bins whose real edges need not match these labels.
tempo_edges = [0, 90, 110, 130, 150, np.inf]
bin_labels = ['0 to 90', '91 to 110', '111 to 130', '131 to 150', '151 and up']
# include_lowest=True keeps a tempo of exactly 0 inside the first bin
train['tempo_bins'] = pd.cut(train['tempo'], bins=tempo_edges, labels=bin_labels, include_lowest=True)
ax = sns.violinplot(x="tempo_bins", y="popularity", hue="mode", data=train)

For reference, the signature of `pd.cut`: `pd.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True)`

In [None]:
# Re-bin tempo into five quantile bins and plot popularity per bin, split by mode.
# This assignment overwrites any existing train['tempo_bins'] column.
# NOTE(review): pd.qcut chooses its edges from the data's quantiles, so the
# numeric labels ('0 to 90', '95 to 120') are not guaranteed to describe the
# actual bin ranges — confirm the edges or relabel.
bin_labels = ['0 to 90', '95 to 120', 'Moderate', 'Moderately High', 'High']
train['tempo_bins'] = pd.qcut(train['tempo'], q=5, precision=0, labels=bin_labels)
ax = sns.violinplot(x="tempo_bins", y="popularity", hue="mode",data=train)

For reference, the signature of `pd.cut`: `pd.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True)`

In [None]:
df.sort_values(by='tempo', ascending=True).head(3)

In [None]:
train['valence_bins'] = pd.qcut(train['valence'], q=3, precision=0)
ax = sns.violinplot(x="valence_bins", y="popularity", hue="mode",data=train)

In [None]:
train.columns

In [None]:
ax = sns.stripplot(x="time_signature", y="popularity", data=train)

In [None]:
g = sns.catplot(x="time_signature", y="popularity",
                hue="mode", col="key",
                data=train, kind="strip",
                height=6, aspect=.7, col_wrap=3);

In [None]:
#train.groupby(['danceability', 'mode'])['popularity'].mean().unstack().plot()

In [None]:
train['danceability_bins'] = pd.qcut(train['danceability'], q=4, precision=0)

In [None]:
train.columns

In [None]:
# to explore in databricks and then come back here with the ones i like
audio = train[['popularity', 'is_featured_artist', 'explicit', 'popularity_bin',
              'is_top_billboard_label', 'danceability', 'energy', 'key', 'mode',
              'speechiness', 'valence', 'tempo', 'time_signature', 'duration_minutes',
              'album_type', 'tempo_bins', 'danceability_bins', 'valence_bins']]

In [None]:
audio.to_csv('audio_features.csv')

In [None]:
train.columns

In [None]:
sns.barplot(x="tempo_bins", y="popularity", hue="time_signature", data=train.drop(train[train.time_signature == 0].index))
plt.xlabel("Tempo", size=14)
plt.ylabel("Avg Popularity", size=14)
plt.show()

# Explore Genre

In [None]:
genre = pd.read_csv('genre_count_df.csv', index_col=0)
genre.head()

In [None]:
list(genre.columns)

### How many tracks of each genre?

In [None]:
for col in genre.iloc[ :, 31:417]:
    print(col + ': ', genre[col].sum())

In [None]:
# Gather each column's name and its column total so they can be put into a
# pandas DataFrame.

genre_counts = pd.DataFrame([])

# names of the columns at positions 31 through 416 of `genre`
columns = list(genre.iloc[:, 31:417])
# the sum of each of those columns, in the same order
values = [genre[name].sum() for name in columns]

In [None]:
new_values = pd.Series(values, index=columns)
# DataFrame.append was removed in pandas 2.0, so build the single-row frame
# directly: one row (index 0) with one column per entry of `columns`
genre_counts = pd.DataFrame([new_values])

genre_counts.head()

In [None]:
genre_counts.to_csv('genre_sums_df.csv')

In [None]:
genre_counts = genre_counts.T
genre_counts = genre_counts.rename(columns={0: "counts"})

genre_counts.head()

In [None]:
genre_counts.counts.hist()

In [None]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

genre_counts[genre_counts['counts'] >= 150].sort_values(by='counts', ascending=False)

In [None]:
# to explore in databricks and then come back here to reproduce the visualizations that i liked

# select only houston rap tracks
houston_rap = genre[genre['houston rap'] == 1]
# select only the features I will be using
houston_rap = houston_rap[['popularity', 'is_featured_artist', 'explicit',
              'key', 'mode', 'tempo', 'time_signature', 'duration_minutes',
              'tempo_bins', 'popularity_bin']]
# reminder to rename time_signature to beats per measure for sake of visuals/presentation

In [None]:
houston_rap