## CS 155 MP 1
Nora Xiao, Sani Deshmukh, Emily Xu, and Ashiria Goel

### Import data and libraries

In [4]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import gdown

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier

In [5]:
# Load data

train_path = 'https://drive.google.com/uc?id=1-skT_odY3h6OSAt0SDzzj3UhrZ002SRh'
test_path = 'https://drive.google.com/uc?id=102z70Beh6Q26yqSqo0TR3fgADC-b3Lpe'

gdown.download(train_path, 'train.csv', quiet=False)
gdown.download(test_path, 'test.csv', quiet=False)

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

Downloading...
From: https://drive.google.com/uc?id=1-skT_odY3h6OSAt0SDzzj3UhrZ002SRh
To: /content/train.csv
100%|██████████| 1.13M/1.13M [00:00<00:00, 54.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=102z70Beh6Q26yqSqo0TR3fgADC-b3Lpe
To: /content/test.csv
100%|██████████| 259k/259k [00:00<00:00, 27.3MB/s]


### Data processing

In [6]:
print('Train shape:', train.shape)
print('Test shape:', test.shape)

Train shape: (3864, 20)
Test shape: (967, 19)


In [7]:
train.head()

Unnamed: 0,time_signature,speechiness,danceability,duration_ms,energy,track_href,mode,uri,type,track_album_release_date,analysis_url,id,instrumentalness,valence,key,tempo,loudness,acousticness,liveness,Popularity_Type
0,4.0,0.204,0.882,140733.0,0.764,https://api.spotify.com/v1/tracks/7iabz12vAuVQ...,1.0,spotify:track:7iabz12vAuVQYyekFIWJxD,audio_features,2024-05-23,https://api.spotify.com/v1/audio-analysis/7iab...,7iabz12vAuVQYyekFIWJxD,0.0,0.886,11.0,140.113,-5.241,0.359,0.119,High
1,4.0,0.159,0.779,246960.0,0.64,https://api.spotify.com/v1/tracks/4TsmezEQVSZN...,1.0,spotify:track:4TsmezEQVSZNNPv5RJ65Ov,audio_features,2005-08-29,https://api.spotify.com/v1/audio-analysis/4Tsm...,4TsmezEQVSZNNPv5RJ65Ov,0.000766,0.499,7.0,99.017,-8.415,0.000155,0.101,High
2,4.0,0.223,0.77,189707.0,0.597,https://api.spotify.com/v1/tracks/1AtFSBJibfaq...,1.0,spotify:track:1AtFSBJibfaqfiOByQCwZ5,audio_features,2024-06-21,https://api.spotify.com/v1/audio-analysis/1AtF...,1AtFSBJibfaqfiOByQCwZ5,0.0,0.875,1.0,170.022,-4.901,0.53,0.239,High
3,5.0,0.321,0.573,172296.0,0.693,https://api.spotify.com/v1/tracks/18Crh1Nd55lR...,1.0,spotify:track:18Crh1Nd55lRX4MVoJegO1,audio_features,2024-11-08,https://api.spotify.com/v1/audio-analysis/18Cr...,18Crh1Nd55lRX4MVoJegO1,0.00424,0.837,6.0,150.85,-6.22,0.609,0.196,Low
4,4.0,0.0315,0.714,274488.0,0.72,https://api.spotify.com/v1/tracks/42Xxh6RlXeZU...,1.0,spotify:track:42Xxh6RlXeZUNtNfbJ6A3D,audio_features,2020-12-24,https://api.spotify.com/v1/audio-analysis/42Xx...,42Xxh6RlXeZUNtNfbJ6A3D,0.0,0.696,6.0,113.015,-6.751,0.0922,0.0742,Low


In [14]:
def process_data(data, train=True):
  '''
  Data processing pipeline.
  Input: Dataframe
  Output: X, y if train=True, just X if train=False
  '''
  if (not train):
    data = data.drop(['track_href', 'uri', 'type', 'analysis_url', 'ID'], axis=1)
  else:
    data = data.drop(['track_href', 'uri', 'type', 'analysis_url', 'id'], axis=1)

  data = data.dropna()

  target = 'Popularity_Type'
  features = [col for col in data.columns if col != target]

  # Convert release date into just release year
  data['release_year'] = data['track_album_release_date'].str.extract('(\d{4})', expand=False).astype(float)
  data = data.drop('track_album_release_date', axis=1)

  # Encode popularity type as high=1, low=0
  if train:
    data[target] = data[target].replace({'High': 1, 'Low': 0})
    return data.drop(target, axis=1), data[target]
  return data

In [15]:
X, y = process_data(train)
X.head()


X_test = process_data(test, False)
X_test.head()

  data[target] = data[target].replace({'High': 1, 'Low': 0})


Unnamed: 0,time_signature,speechiness,danceability,duration_ms,energy,mode,instrumentalness,valence,key,tempo,loudness,acousticness,liveness,release_year
0,4.0,0.258,0.584,180638.0,0.747,0.0,0.0013,0.798,10.0,98.111,-4.726,0.105,0.149,2023.0
1,4.0,0.0422,0.816,158978.0,0.532,0.0,0.0,0.258,4.0,136.882,-5.634,0.218,0.25,2024.0
2,4.0,0.115,0.849,139041.0,0.584,0.0,6e-06,0.724,1.0,115.018,-8.195,0.0929,0.492,2022.0
3,4.0,0.366,0.584,253533.0,0.706,0.0,0.0,0.579,10.0,174.039,-7.899,0.209,0.125,1995.0
4,4.0,0.0281,0.429,266773.0,0.661,1.0,0.000121,0.285,11.0,173.372,-7.227,0.00239,0.234,2000.0


In [10]:
print('Features:', X.columns.tolist())

Features: ['time_signature', 'speechiness', 'danceability', 'duration_ms', 'energy', 'mode', 'instrumentalness', 'valence', 'key', 'tempo', 'loudness', 'acousticness', 'liveness', 'release_year']


In [16]:
'''
- duration_ms: no popular songs > ~0.6. KEEP.
- danceability: no popular songs < ~0.1. KEEP.
- time_signature: more plots, may not be correlated.
- speechiness: more plots, may not be correlated.
- energy: less popularity < a certain point. KEEP.
- mode: more plots, may not be correlated.
- track_album_release_date: probably not correlated.
- instrumentalness: possibly less popular songs with high instrumentalness, need to look more.
- valence: idk
- key: idk
- tempo: KEEP. slow tempo = less pop.
- loudness: KEEP. low loudness = less pop.
- acousticness: idk
- liveness: idk
-
'''

'\n- duration_ms: no popular songs > ~0.6. KEEP.\n- danceability: no popular songs < ~0.1. KEEP.\n- time_signature: more plots, may not be correlated.\n- speechiness: more plots, may not be correlated.\n- energy: less popularity < a certain point. KEEP.\n- mode: more plots, may not be correlated.\n- track_album_release_date: probably not correlated.\n- instrumentalness: possibly less popular songs with high instrumentalness, need to look more.\n- valence: idk\n- key: idk\n- tempo: KEEP. slow tempo = less pop.\n- loudness: KEEP. low loudness = less pop.\n- acousticness: idk\n- liveness: idk\n-\n'

### Training

In [17]:
rf_model = RandomForestClassifier(n_estimators= 100, criterion='gini')
rf_model.fit(X, y)
y_pred = rf_model.predict(X_test)


In [18]:
# prompt: show the predictions y_pred along with song name

# Assuming X_test is a DataFrame and has a column named 'track_name'
# Replace 'track_name' if your column has a different name
predictions_with_names = pd.DataFrame({'song_name': test['track_name'], 'prediction': y_pred})
predictions_with_names


KeyError: 'track_name'