In [1]:
import warnings
warnings.filterwarnings('ignore')

In [37]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
from scipy.stats import chi2_contingency

In [15]:
# Location of the Spotify dataset, relative to this notebook
file_path = "../Resources/Spotify_data.csv"
# Load the CSV into a DataFrame
spotify_df = pd.read_csv(file_path)
# Preview the first five rows
spotify_df.head()

Unnamed: 0.1,Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4,0.286,1,Mask Off,Future
1,1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4,0.588,1,Redbone,Childish Gambino
2,2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4,0.173,1,Xanny Family,Future
3,3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4,0.23,1,Master Of None,Beach House
4,4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4,0.904,1,Parallel Lines,Junior Boys


In [16]:
# Drop every row that has at least one missing value
# (dropna defaults to how="any" on rows)
cleaned_rows_df = spotify_df.dropna()
# Non-null count per column, to confirm all columns have the same length
cleaned_rows_df.count()

Unnamed: 0          2017
acousticness        2017
danceability        2017
duration_ms         2017
energy              2017
instrumentalness    2017
key                 2017
liveness            2017
loudness            2017
mode                2017
speechiness         2017
tempo               2017
time_signature      2017
valence             2017
target              2017
song_title          2017
artist              2017
dtype: int64

In [17]:
# Keep a subset of the columns; this drops the "Unnamed: 0" index-like
# column together with key, mode and time_signature.
reduced_df = cleaned_rows_df.loc[:, [
    'acousticness', 'danceability', 'duration_ms', 'energy',
    'instrumentalness', 'liveness', 'loudness', 'speechiness',
    'tempo', 'valence', 'target',
    'song_title', 'artist',
]]
reduced_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,target,song_title,artist
0,0.0102,0.833,204600,0.434,0.0219,0.165,-8.795,0.431,150.062,0.286,1,Mask Off,Future
1,0.199,0.743,326933,0.359,0.00611,0.137,-10.401,0.0794,160.083,0.588,1,Redbone,Childish Gambino
2,0.0344,0.838,185707,0.412,0.000234,0.159,-7.148,0.289,75.044,0.173,1,Xanny Family,Future
3,0.604,0.494,199413,0.338,0.51,0.0922,-15.236,0.0261,86.468,0.23,1,Master Of None,Beach House
4,0.18,0.678,392893,0.561,0.512,0.439,-11.648,0.0694,174.004,0.904,1,Parallel Lines,Junior Boys


In [18]:
# Identify duplicates: duplicated() flags every repeat of a row after its
# first occurrence, comparing all columns.
is_repeat = reduced_df.duplicated()
duplicates_df = reduced_df.loc[is_repeat]
print("Duplicate Rows:\n", duplicates_df)

Duplicate Rows:
      acousticness  danceability  duration_ms  energy  instrumentalness  \
267      0.096200         0.654       252095   0.292          0.003100   
508      0.024600         0.586       374133   0.806          0.000167   
894      0.000334         0.907       183581   0.616          0.425000   
927      0.934000         0.440       254360   0.433          0.811000   
981      0.036900         0.448       258653   0.733          0.042500   

     liveness  loudness  speechiness    tempo  valence  target  \
267     0.106   -12.977       0.4740  153.547    0.111       1   
508     0.353    -9.542       0.0548  110.325    0.207       1   
894     0.110    -7.073       0.0577  124.036    0.614       1   
927     0.273   -17.453       0.0412   79.952    0.217       1   
981     0.169    -6.238       0.0292  142.008    0.404       1   

                        song_title         artist  
267                          River          Ibeyi  
508                    Her Fantasy   

In [19]:
# Remove repeated rows, keeping the first occurrence of each
duplicates_cleaned_spotify = reduced_df.drop_duplicates(keep="first")
duplicates_cleaned_spotify.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,target,song_title,artist
0,0.0102,0.833,204600,0.434,0.0219,0.165,-8.795,0.431,150.062,0.286,1,Mask Off,Future
1,0.199,0.743,326933,0.359,0.00611,0.137,-10.401,0.0794,160.083,0.588,1,Redbone,Childish Gambino
2,0.0344,0.838,185707,0.412,0.000234,0.159,-7.148,0.289,75.044,0.173,1,Xanny Family,Future
3,0.604,0.494,199413,0.338,0.51,0.0922,-15.236,0.0261,86.468,0.23,1,Master Of None,Beach House
4,0.18,0.678,392893,0.561,0.512,0.439,-11.648,0.0694,174.004,0.904,1,Parallel Lines,Junior Boys


In [20]:
# Identify outliers
def identify_outliers(duplicates_cleaned_spotify, col):
    """Return the rows whose value in `col` falls outside the 1.5 * IQR fences.

    Parameters
    ----------
    duplicates_cleaned_spotify : pandas.DataFrame
        Frame to inspect. It is not modified.
    col : hashable
        Label of the column to check.

    Returns
    -------
    pandas.DataFrame
        The rows whose value is strictly below Q1 - 1.5 * IQR or strictly
        above Q3 + 1.5 * IQR, with their original index labels.
    """
    values = duplicates_cleaned_spotify[col]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = values.lt(first_quartile - 1.5 * spread)
    too_high = values.gt(third_quartile + 1.5 * spread)
    return duplicates_cleaned_spotify[too_low | too_high]

# Identify outliers for each numeric column.
# "number" is the pandas selector for all numeric dtypes, so this cell does
# not depend on numpy being imported under the name `np`.
# NOTE: the selection also includes the numeric `target` label column.
numeric_cols = duplicates_cleaned_spotify.select_dtypes(include="number").columns
# Map each numeric column name to the frame of rows flagged as outliers in it
outliers_dict = {col: identify_outliers(duplicates_cleaned_spotify, col) for col in numeric_cols}


In [21]:
# def remove_outliers(duplicates_cleaned_spotify, col):
#     Q1 = duplicates_cleaned_spotify[col].quantile(0.25)
#     Q3 = duplicates_cleaned_spotify[col].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     return duplicates_cleaned_spotify[(duplicates_cleaned_spotify[col] >= lower_bound) & (duplicates_cleaned_spotify[col] <= upper_bound)]

# # Remove outliers for each numeric column
# duplicates_cleaned_spotify = duplicates_cleaned_spotify.copy()
# for col in numeric_cols:
#     cleaned_spotify_complete = remove_outliers(duplicates_cleaned_spotify, col)


In [22]:
def replace_outliers_with_median(duplicates_cleaned_spotify, col):
    """Overwrite the IQR outliers of `col` with the column median, in place.

    A value is an outlier when it is strictly below Q1 - 1.5 * IQR or
    strictly above Q3 + 1.5 * IQR. The median is computed from the column
    before any value is replaced.

    Parameters
    ----------
    duplicates_cleaned_spotify : pandas.DataFrame
        Frame to modify. The column `col` is reassigned on this same object.
    col : hashable
        Label of the column to process.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    values = duplicates_cleaned_spotify[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    is_outlier = values.lt(q1 - fence) | values.gt(q3 + fence)
    # np.where builds a fresh array: median where flagged, original value otherwise
    duplicates_cleaned_spotify[col] = np.where(is_outlier, values.median(), values)
    return duplicates_cleaned_spotify

# Replace outliers for each numeric column.
# Work on a dedicated copy so `duplicates_cleaned_spotify` keeps its original
# values: re-running this cell then always starts from the same data instead of
# recomputing the IQR fences on values that were already replaced.
# Starting from the copy also guarantees `cleaned_spotify_complete` is defined
# even if `numeric_cols` is empty.
cleaned_spotify_complete = duplicates_cleaned_spotify.copy()
for col in numeric_cols:
    # Feed the result of the previous column back in so replacements accumulate
    cleaned_spotify_complete = replace_outliers_with_median(cleaned_spotify_complete, col)


In [23]:
# Rename the title/artist columns; all other columns keep their names
renamed_spotify_complete = cleaned_spotify_complete.rename(
    {"song_title": "song", "artist": "singer"},
    axis="columns",
)
renamed_spotify_complete.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence,target,song,singer
0,0.0102,0.833,204600.0,0.434,0.0219,0.165,-8.795,0.0549,150.062,0.286,1.0,Mask Off,Future
1,0.199,0.743,326933.0,0.359,0.00611,0.137,-10.401,0.0794,160.083,0.588,1.0,Redbone,Childish Gambino
2,0.0344,0.838,185707.0,0.412,0.000234,0.159,-7.148,0.0549,75.044,0.173,1.0,Xanny Family,Future
3,0.604,0.494,199413.0,0.338,7.4e-05,0.0922,-6.2475,0.0261,86.468,0.23,1.0,Master Of None,Beach House
4,0.18,0.678,229120.0,0.561,7.4e-05,0.439,-11.648,0.0694,174.004,0.904,1.0,Parallel Lines,Junior Boys


In [24]:

# Build a {column name: list of values} dictionary; the tuple below fixes the
# column order of the final frame.
data = {
    name: renamed_spotify_complete[name].tolist()
    for name in (
        'song', 'acousticness', 'danceability', 'energy', 'loudness',
        'speechiness', 'liveness', 'valence', 'tempo', 'duration_ms',
        'instrumentalness', 'target', 'singer',
    )
}

# Rebuild the DataFrame from plain lists, which gives it a fresh 0..n-1 index
spotify_complete = pd.DataFrame(data)
spotify_complete.head()

Unnamed: 0,song,acousticness,danceability,energy,loudness,speechiness,liveness,valence,tempo,duration_ms,instrumentalness,target,singer
0,Mask Off,0.0102,0.833,0.434,-8.795,0.0549,0.165,0.286,150.062,204600.0,0.0219,1.0,Future
1,Redbone,0.199,0.743,0.359,-10.401,0.0794,0.137,0.588,160.083,326933.0,0.00611,1.0,Childish Gambino
2,Xanny Family,0.0344,0.838,0.412,-7.148,0.0549,0.159,0.173,75.044,185707.0,0.000234,1.0,Future
3,Master Of None,0.604,0.494,0.338,-6.2475,0.0261,0.0922,0.23,86.468,199413.0,7.4e-05,1.0,Beach House
4,Parallel Lines,0.18,0.678,0.561,-11.648,0.0694,0.439,0.904,174.004,229120.0,7.4e-05,1.0,Junior Boys


# Hypotheses 1

**Null Hypothesis (H0):**

H0: Danceability does not affect an artist's popularity.

**Alternative Hypothesis (Ha):**

Ha: Danceability affects an artist's popularity.

In [34]:
# T-test on danceability
# Split the danceability scores by the target label (1 = liked, 0 = not liked)
liked_songs = spotify_complete.loc[spotify_complete['target'] == 1, 'danceability']
unliked_songs = spotify_complete.loc[spotify_complete['target'] == 0, 'danceability']

# Independent two-sample t-test; equal_var=True is scipy's default (pooled variance)
t_stat, p_val = ttest_ind(liked_songs, unliked_songs, equal_var=True)

print(f"T-statistic: {t_stat}, P-value: {p_val}")

T-statistic: 7.767922630113768, P-value: 1.2621603705182227e-14


In [35]:
# One-way ANOVA on danceability
# One group per target value (the code only compares the values 1 and 0)
group1 = spotify_complete.loc[spotify_complete['target'].eq(1), 'danceability']
group0 = spotify_complete.loc[spotify_complete['target'].eq(0), 'danceability']

# With only two groups the F-statistic equals the square of the pooled-variance
# t-statistic and the p-value is the same, so this repeats the t-test result.
f_stat, p_val = f_oneway(group0, group1)

print(f"F-statistic: {f_stat}, P-value: {p_val}")


F-statistic: 60.34062198743364, P-value: 1.2621603705197201e-14


In [36]:


# Bin danceability into three equal-width intervals labelled Low / Medium / High
spotify_complete['danceability_level'] = pd.cut(
    spotify_complete['danceability'],
    bins=3,
    labels=['Low', 'Medium', 'High'],
)

# Count songs for every (danceability level, target) combination
contingency_table = pd.crosstab(
    index=spotify_complete['danceability_level'],
    columns=spotify_complete['target'],
)

# Chi-square test of independence on the contingency table
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2_stat}, P-value: {p_val}")


Chi-Square Statistic: 69.77041572522793, P-value: 7.072072841659867e-16


## Conclusion 1

**All three tests (t-test, ANOVA and chi-square) give very small p-values, so danceability does matter in making the singer popular. We therefore reject the null hypothesis in favor of the alternative hypothesis.**



# Hypotheses 2

**Null Hypothesis (H0):**

H0: The duration of a song does not affect the likeability of a song.

**Alternative Hypothesis (Ha):**

Ha: The duration of a song affects the likeability of a song.

In [40]:
# T-test on song duration
# Split the durations (milliseconds) by the target label (1 = liked, 0 = not liked)
liked_songs_duration = spotify_complete.loc[spotify_complete['target'] == 1, 'duration_ms']
unliked_songs_duration = spotify_complete.loc[spotify_complete['target'] == 0, 'duration_ms']

# Independent two-sample t-test; equal_var=True is scipy's default (pooled variance)
t_stat, p_val = ttest_ind(liked_songs_duration, unliked_songs_duration, equal_var=True)

print(f"T-statistic: {t_stat}, P-value: {p_val}")


T-statistic: 3.913593782114826, P-value: 9.39594299454354e-05


In [41]:
# One-way ANOVA on song duration
# One group per target value
group1 = spotify_complete.loc[spotify_complete['target'].eq(1), 'duration_ms']
group0 = spotify_complete.loc[spotify_complete['target'].eq(0), 'duration_ms']

# With only two groups the F-statistic equals the square of the pooled-variance
# t-statistic and the p-value is the same, so this repeats the t-test result.
f_stat, p_val = f_oneway(group0, group1)

print(f"F-statistic: {f_stat}, P-value: {p_val}")


F-statistic: 15.316216291407825, P-value: 9.395942994554528e-05


In [42]:
# Perform Chi-Square
# Bin duration into three equal-width intervals labelled Short / Medium / Long
spotify_complete['duration_category'] = pd.cut(
    spotify_complete['duration_ms'],
    bins=3,
    labels=['Short', 'Medium', 'Long'],
)

# Count songs for every (duration category, target) combination
contingency_table = pd.crosstab(
    index=spotify_complete['duration_category'],
    columns=spotify_complete['target'],
)

# Chi-square test of independence on the contingency table
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2_stat}, P-value: {p_val}")


Chi-Square Statistic: 28.38613461098819, P-value: 6.855342981024413e-07


## Conclusion 2

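**All three tests give small p-values, and the positive t-statistic shows that liked songs are longer on average, so a longer duration goes with a higher chance of the song being liked. We therefore reject the null hypothesis in favor of the alternative hypothesis.**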


# Hypotheses 3

**Null Hypothesis (H0):**

H0: The tempo of a song does not affect its likeability.

**Alternative Hypothesis (Ha):**

Ha: The tempo of a song affects its likeability.

In [43]:
# T-test on tempo
# Split the tempo values by the target label (1 = liked, 0 = not liked)
liked_songs_tempo = spotify_complete.loc[spotify_complete['target'] == 1, 'tempo']
unliked_songs_tempo = spotify_complete.loc[spotify_complete['target'] == 0, 'tempo']

# Independent two-sample t-test; equal_var=True is scipy's default (pooled variance)
t_stat, p_val = ttest_ind(liked_songs_tempo, unliked_songs_tempo, equal_var=True)

print(f"T-statistic: {t_stat}, P-value: {p_val}")


T-statistic: 1.5238331636472835, P-value: 0.1277077038973061


In [44]:
# One-way ANOVA on tempo
# One group per target value (the code only compares the values 1 and 0)
group1 = spotify_complete.loc[spotify_complete['target'].eq(1), 'tempo']
group0 = spotify_complete.loc[spotify_complete['target'].eq(0), 'tempo']

# With only two groups the F-statistic equals the square of the pooled-variance
# t-statistic and the p-value is the same, so this repeats the t-test result.
f_stat, p_val = f_oneway(group0, group1)

print(f"F-statistic: {f_stat}, P-value: {p_val}")


F-statistic: 2.3220675106313187, P-value: 0.1277077038974697


In [45]:
# Chi-square test on tempo
# Bin tempo into three equal-width intervals labelled Slow / Medium / Fast
spotify_complete['tempo_category'] = pd.cut(
    spotify_complete['tempo'],
    bins=3,
    labels=['Slow', 'Medium', 'Fast'],
)

# Count songs for every (tempo category, target) combination
contingency_table = pd.crosstab(
    index=spotify_complete['tempo_category'],
    columns=spotify_complete['target'],
)

# Chi-square test of independence on the contingency table
chi2_stat, p_val, dof, expected = chi2_contingency(contingency_table)

print(f"Chi-Square Statistic: {chi2_stat}, P-value: {p_val}")


Chi-Square Statistic: 23.6097501859701, P-value: 7.468061531344004e-06


## Conclusion 3

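**The t-test and ANOVA results (p ≈ 0.128) show no significant difference in mean tempo between liked and unliked songs, so on these tests we fail to reject the null hypothesis. Note, however, that the chi-square test on the binned tempo categories is significant (p ≈ 7.5e-06), so an association that is not captured by a difference in means cannot be ruled out.**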