In [4]:
import pandas as pd
train=pd.read_csv('../input/song-popularity-prediction/train.csv')

In [5]:
features=train.columns
features

In [6]:
train.head()

In [7]:
train.info()

In [8]:
#Mean song duration (ms): the candidate fill value for nulls in that column
import numpy as np
dur_mean=train['song_duration_ms'].mean()
#train['song_duration_ms'].fillna(dur_mean,inplace=True)

In [9]:
# Rows that contain at least one missing value, and the largest number of
# missing values found in any single row
has_missing=train.isna().any(axis=1)
train_null=train[has_missing]
train_null.isna().sum(axis=1).max()

In [34]:
type(train_null)

In [10]:
train_null[train_null.isna().sum(axis=1)>=4]

In [11]:
#Remove every row that has a null in 4 or more columns
missing_per_row=train.isna().sum(axis=1)
null_idx=train[missing_per_row>=4]
train.drop(index=null_idx.index,inplace=True)

In [52]:
train

In [14]:
#Transforming column instrumentalness: natural log, computed on the whole column at once
train['log_instrumentalness']=np.log(train['instrumentalness'])

In [69]:
#Preprocessing pipeline: standardise the features, then fill missing values with KNNImputer
#(this cell builds the pipeline only; nothing is fitted here)
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipeline=Pipeline([
    ('scale',StandardScaler()),            # scaling comes first so the KNN distances are computed on comparable units
    ('impute',KNNImputer(n_neighbors=7)),  # each missing value is filled from the 7 nearest rows
    #('model',RandomForestClassifier(random_state=42))
])

In [1]:
#Creating Stratified folds for cross-validation 
from sklearn.model_selection import StratifiedShuffleSplit

ss=StratifiedShuffleSplit(n_splits=5,test_size=0.2,random_state=42)

In [2]:
ss

In [36]:
# Target vector, plus the feature matrix without the identifier, the raw
# instrumentalness (its log column is kept instead) and the target itself
y_train=train['song_popularity']
X_train=train.drop(columns=['id','instrumentalness','song_popularity'])

In [40]:
#Creating model
from sklearn.ensemble import RandomForestClassifier

impute_clf=RandomForestClassifier(n_estimators=150,random_state=42)

In [68]:
#Fitting the pipeline on each stratified split, modeling and collecting the validation ROC AUC
scores=[]
from sklearn.metrics import roc_auc_score
for train_idx,val_idx in ss.split(X_train,y_train):
    X_train_new,X_val=X_train.iloc[train_idx],X_train.iloc[val_idx]
    y_train_new,y_val=y_train.iloc[train_idx],y_train.iloc[val_idx]
    # Scaler and imputer are fitted on the training part of the split only and then
    # applied to the validation part, so no validation statistics leak into training.
    # NOTE(review): this requires `pipeline` to be the transformer-only version
    # (no final 'model' step) - confirm which definition is active.
    train_trans=pipeline.fit_transform(X_train_new)
    val_trans=pipeline.transform(X_val)
    impute_clf.fit(train_trans,y_train_new)
    # ROC AUC ranks samples by a continuous score. Hard 0/1 labels from predict()
    # collapse the curve to a single point, so use the positive-class probability
    # (column 1 - assumes the classes are ordered [0, 1]).
    val_proba=impute_clf.predict_proba(val_trans)[:,1]
    scores.append(roc_auc_score(y_val,val_proba))
    print(scores)

In [13]:
#cols_to_be_transformed
cols_to_be_transformed=[]
for cols in train:
    if train[cols].isna().any():
        cols_to_be_transformed.append(cols)
cols_to_be_transformed

In [53]:
train[cols_to_be_transformed]

In [57]:
# Transform the feature matrix, not the raw `train` frame: `train` still carries
# 'id', 'instrumentalness' and the target 'song_popularity', which would leak the
# target into scaling/imputation and would not match X_train.columns when the
# result is wrapped back into a DataFrame.
train_trans=pipeline.fit_transform(X_train)

In [56]:
len(train_trans)

In [46]:
train_trans=pd.DataFrame(train_trans,columns=X_train.columns)

In [47]:
train_trans.info()

In [49]:
train_trans

In [15]:
pipeline.get_params().keys()

In [23]:
y_train=train['song_popularity']
X_train=pd.DataFrame(train)
X_train.drop(['id','instrumentalness','song_popularity'],axis=1,inplace=True)

In [25]:
#Feeding this model into GridSearchCV
from sklearn.model_selection import GridSearchCV
# Keys use the <step name>__<parameter> form, so the pipeline being searched must
# contain steps named 'impute' and 'model'.
# NOTE(review): one definition of `pipeline` in this notebook has its 'model' step
# commented out - confirm the pipeline with the classifier is the active one here.
params={'impute__n_neighbors':[5,10],'model__n_estimators':[50,100]}

# 5-fold cross-validated search scored by ROC AUC; verbose=10 prints progress per fit
grid_pipe=GridSearchCV(pipeline,params,cv=5,verbose=10,scoring='roc_auc')
grid_pipe.fit(X_train,y_train)

In [28]:
#Setting up the full pipeline (scale -> KNN impute -> random forest)
#NOTE(review): n_neighbors=5 / n_estimators=100 are presumably the best setting
#reported by the grid search - confirm against grid_pipe.best_params_
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

pipeline=Pipeline([
    ('scale',StandardScaler()),
    ('impute',KNNImputer(n_neighbors=5)),
    ('model',RandomForestClassifier(n_estimators=100,random_state=42))
])
# The last step is a classifier, which has no transform(); fit_transform() is only
# available when the final step can transform, so fit the pipeline instead.
pipeline.fit(X_train,y_train)

In [43]:
import seaborn as sns
import matplotlib.pyplot as plt
train.hist(figsize=(12,10))
plt.show()

EDA of Each Feature

In [50]:
# song_duration_ms plot
sns.set_theme()
#ax=sns.displot(data=train,x='song_duration_ms',kind='kde',color='red',height=4,aspect=3) 
ax=sns.displot(data=X_train_new,x='song_duration_ms',kind='hist',color='red',height=4,aspect=3)
ax=sns.displot(data=train_trans,x='song_duration_ms',kind='hist',color='red',height=4,aspect=3)
ax=sns.displot(data=train,x='song_duration_ms',kind='kde',height=4,aspect=3,hue='song_popularity') #Kernel Distribution Estimation

From the plots above, we see that songs lasting around 3 minutes are the most popular.

In [53]:
# acousticness plot
import numpy as np
ax=sns.displot(data=X_train_new,x='acousticness',kind='hist',color='orange',height=5,aspect=3)
ax=sns.displot(data=train_trans,x='acousticness',kind='hist',color='orange',height=5,aspect=3)
#ax=sns.displot(data=train,x=np.log(train['acousticness']),kind='kde',color='orange',height=5,aspect=3)
ax=sns.displot(data=train,x='acousticness',kind='kde',height=5,aspect=3,hue='song_popularity') 

Taking a log doesn't help much in improving the trend, so for now we don't apply a log transformation to this column.

In [54]:
# danceability plot
ax=sns.displot(data=X_train_new,x='danceability',kind='hist',color='green',height=4,aspect=3)
ax=sns.displot(data=train_trans,x='danceability',kind='hist',color='green',height=4,aspect=3)
ax=sns.displot(data=train,x='danceability',kind='kde',height=4,aspect=3,hue='song_popularity')

In [55]:
# energy plot
ax=sns.displot(data=X_train_new,x='energy',kind='hist',color='purple',height=4,aspect=3)           #kind=kde for distribution plot
ax=sns.displot(data=train_trans,x='energy',kind='hist',color='purple',height=4,aspect=3)
ax=sns.displot(data=train,x='energy',kind='kde',height=4,aspect=3,hue='song_popularity') 

In [59]:
# instrumentalness plot

ax=sns.displot(data=train,x='instrumentalness',kind='kde',color='brown',height=4,aspect=3) 
ax=sns.displot(data=X_train_new,x='log_instrumentalness',kind='hist',color='brown',height=4,aspect=3) 
ax=sns.displot(data=train_trans,x='log_instrumentalness',kind='hist',color='brown',height=4,aspect=3)
ax=sns.displot(data=train,x='instrumentalness',kind='kde',height=4,aspect=3,hue='song_popularity') 

Using the log of this feature looks like a reasonable choice.

In [61]:
# key plot
sns.displot(data=X_train_new,x='key',color='darkblue',height=4,aspect=3)
sns.displot(data=train_trans,x='key',color='darkblue',height=4,aspect=3)
ax=sns.displot(data=train,x='key',height=4,aspect=3,hue='song_popularity',multiple="stack") 

In [62]:
# liveness plot
ax=sns.displot(data=X_train_new,x='liveness',kind='hist',color='Magenta',height=4,aspect=3)
ax=sns.displot(data=train_trans,x='liveness',kind='hist',color='Magenta',height=4,aspect=3)
#ax=sns.displot(data=train,x=np.log(train['liveness']),kind='kde',color='Magenta',height=4,aspect=3)
ax=sns.displot(data=train,x='liveness',kind='kde',height=4,aspect=4,hue='song_popularity') 

We use the log of this feature, as the log-transformed values give a better distribution.

In [63]:
# loudness plot
ax=sns.displot(data=X_train_new,x='loudness',kind='hist',color='Crimson',height=4,aspect=3)
ax=sns.displot(data=train_trans,x='loudness',kind='hist',color='Crimson',height=4,aspect=3)
ax=sns.displot(data=train,x='loudness',kind='kde',height=4,aspect=4,hue='song_popularity') 

In [None]:
#audio mode
sns.displot(data=train,x='audio_mode',color='cyan',height=4,aspect=3) 
ax=sns.displot(data=train,x='audio_mode',height=4,aspect=3,hue='song_popularity',multiple="stack") 

In [None]:
#Speechiness 
ax=sns.displot(data=train,x='speechiness',kind='kde',color='lightGreen',height=4,aspect=3)
ax=sns.displot(data=train,x=np.log(train['speechiness']),kind='kde',color='lightGreen',height=4,aspect=3)
ax=sns.displot(data=train,x='speechiness',kind='kde',height=4,aspect=4,hue='song_popularity') 

Using log of this feature

In [None]:
#tempo
ax=sns.displot(data=train,x='tempo',kind='kde',color='Black',height=4,aspect=3) 
ax=sns.displot(data=train,x='tempo',kind='kde',height=4,aspect=4,hue='song_popularity') 

In [None]:
#Time signature
sns.displot(data=train,x='time_signature',color='DarkOrange',height=4,aspect=3) 
ax=sns.displot(data=train,x='time_signature',height=4,aspect=3,hue='song_popularity',multiple="stack")

In [None]:
#Audio Valence
ax=sns.displot(data=train,x='audio_valence',kind='kde',color='tan',height=4,aspect=3) 
ax=sns.displot(data=train,x='audio_valence',kind='kde',height=4,aspect=4,hue='song_popularity') 

In [None]:
#Song popularity
sns.displot(data=train,x='song_popularity',color='maroon',height=4,aspect=3) 

There is some imbalance between the classes, though it is not severe.

In [None]:
train.info()

In [20]:
#converting the songs in mins
train['song_duration_ms']=train['song_duration_ms']/(1000*60)

In [21]:
train.rename(columns={'song_duration_ms':'song_duration_mins'},inplace=True)

In [13]:
#Transforming column instrumentalness (liveness and speechiness are left untransformed for now)
train['log_instrumentalness']=np.log(train['instrumentalness'])
#train['log_liveness']=train['liveness'].apply(lambda x: np.log(x))
#train['log_speechiness']=train['speechiness'].apply(lambda x: np.log(x))

In [None]:
y_train=train['song_popularity']
X_train=pd.DataFrame(train)

In [None]:
X_train

In [None]:
X_train.drop(['id','instrumentalness','song_popularity'],axis=1,inplace=True)

In [None]:
X_train

In [None]:
import xgboost as xgb

xgb_clf=xgb.XGBClassifier(random_state=42,n_estimators=50,max_depth=10,learning_rate=0.38)
xgb_clf.fit(X_train[features_xgb],y_train)

In [None]:
# Build the submission frame on an explicit copy, so adding a column below does
# not write into a slice of `test` (avoids SettingWithCopyWarning).
sub=test[['id']].copy()
# `knn` itself is never fitted: GridSearchCV fits clones of the estimator it is
# given. The refitted best model is reached through the fitted search object.
sub['song_popularity']=knn_grid.predict(X_test)
sub['song_popularity'].value_counts()

In [None]:
# index=False: write only the 'id' and 'song_popularity' columns, not the
# DataFrame's row index as an extra unnamed first column.
sub.to_csv('submission.csv',index=False)

In [None]:
train.head()

In [None]:
#Checking correlation of the columns - with the null values
fig, ax = plt.subplots(figsize=(20,10))
ax=sns.heatmap(X_train.corr(),annot=True,linewidths=.5)

In [None]:
features_xgb=['acousticness','energy','loudness','log_instrumentalness']  

In [None]:
train.describe()

In [None]:
train['key'].value_counts()

In [None]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='mean')
cols_to_be_imputed_with_mean=['song_duration_mins','acousticness','danceability','energy','loudness','log_instrumentalness']
train[cols_to_be_imputed_with_mean]=imputer.fit_transform(train[cols_to_be_imputed_with_mean])

In [None]:
imputer.statistics_

In [None]:
#Imputing key with mode
impute_mode=SimpleImputer(missing_values=np.nan,strategy='most_frequent')
train['key']=impute_mode.fit_transform(train['key'].to_numpy().reshape(-1,1))

In [None]:
#Median imputer
impute_med=SimpleImputer(missing_values=np.nan,strategy='median')
cols_to_be_imputed_with_median=['instrumentalness','liveness']
train[cols_to_be_imputed_with_median]=impute_med.fit_transform(train[cols_to_be_imputed_with_median])

In [None]:
train.info()

In [None]:
fig, ax = plt.subplots(figsize=(20,10))
ax=sns.heatmap(train.corr(),annot=True,linewidths=.5)

In [None]:
#Sampling data
#count_of_non_popular_song=len(train[train['song_popularity']==0])
#popular_songs=train[train['song_popularity']==1]
#train_oversampled=popular_songs.sample(count_of_non_popular_song,replace=True)
#X_train_sampled=pd.concat([train_oversampled,train[train['song_popularity']==0]])

In [None]:
train

In [None]:
#Scaling the features 
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
pipeline=Pipeline([('std_scaler',StandardScaler())])

In [None]:
#Heatmap insight: 'acousticness','energy','loudness','audio_valence','danceability','log_instrumentalness' are the columns of interest
#features_set1=['acousticness','energy','loudness','audio_valence','log_intrumentalness']
# For now every column is kept except the raw instrumentalness, the target and the id
y_train=train['song_popularity']
X_train=train.drop(columns=['instrumentalness','song_popularity','id'])

In [None]:
# Keep the scaled values: without the assignment the result is only displayed and
# every later model is trained on unscaled data, while the test features are
# passed through pipeline.transform(). The array is wrapped back into a DataFrame
# so column-name selection keeps working on X_train.
X_train=pd.DataFrame(pipeline.fit_transform(X_train),columns=X_train.columns,index=X_train.index)
X_train.head()

In [None]:
train.info()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn=KNeighborsClassifier()

params={'weights':['distance'],'n_neighbors':[10,12,15]}

knn_grid=GridSearchCV(knn,params,verbose=10,cv=5,scoring='roc_auc')
knn_grid.fit(X_train,y_train)

In [None]:
X_train

In [None]:
sns.scatterplot(data=train, x="energy", y="loudness", hue="song_popularity")

In [None]:
# Resampling + model pipeline: SMOTE over-samples the minority class, the random
# under-sampler then trims the majority class, and a random forest is trained on
# the resampled data. Samplers in an imblearn Pipeline are applied during fit only.
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
# Both samplers draw random samples; seed them like the model so that repeated
# runs (and the cross-validation scores built on them) are reproducible.
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)
steps = [('o', over), ('u', under),('model', model)]
pipeline_smote = Pipeline(steps=steps)

In [None]:
from sklearn.model_selection import GridSearchCV
params={'model__n_estimators':[50,100,150],'model__max_features':[4,6]}

smote_cv=GridSearchCV(estimator=pipeline_smote,param_grid=params,cv=5,scoring='roc_auc',verbose=10)
smote_cv.fit(X_train,y_train)

In [None]:
best_smote=smote_cv.best_estimator_.fit(X_train,y_train)

In [None]:
smote_cv.get_params().keys()

In [None]:
model.get_params()

In [None]:
# pipeline_smote ends in a classifier, and a pipeline only offers fit_resample()
# when its final step is a sampler. Apply the two samplers directly, in the same
# order as in the pipeline: over-sample first, then under-sample.
X_resampled,y_resampled=over.fit_resample(X_train,y_train)
X_train,y_train=under.fit_resample(X_resampled,y_resampled)

In [None]:
# Number of samples in class 0: sum the boolean mask. count() would return the
# number of non-null entries, i.e. the total row count regardless of class.
(y_train==0).sum()

In [None]:
svc_clf.get_params()

In [None]:
#Grid search over LinearSVC hyper-parameters (C and tol) with 5-fold cross-validation, scored by ROC AUC
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

svc_clf=LinearSVC(random_state=42)
# Candidate values: regularisation strength C and stopping tolerance tol
param_svc={'C':[5,10],'tol':[0.005,0.05]}

svc_grid=GridSearchCV(svc_clf,param_svc,verbose=10,cv=5,scoring='roc_auc')
svc_grid.fit(X_train,y_train)

In [None]:
svc_grid.best_estimator_

In [None]:
#Building a model with best estimator
svc_final=svc_grid.best_estimator_
svc_final.fit(X_train,y_train)

In [None]:
test=pd.read_csv('../input/song-popularity-prediction/test.csv')

In [None]:
#Converting the song duration from milliseconds to minutes, exactly as done for train
# The parentheses matter: x/1000*60 evaluates as (x/1000)*60, which multiplies the
# seconds by 60 instead of dividing the milliseconds by 60000.
test['song_duration_ms']=test['song_duration_ms']/(1000*60)
test.rename(columns={'song_duration_ms':'song_duration_mins'},inplace=True)

In [None]:
#Log of instrumentalness for the test set, computed on the whole column at once
test['log_instrumentalness']=np.log(test['instrumentalness'])
#test['log_liveness']=test['liveness'].apply(lambda x: np.log(x))
#test['log_speechiness']=test['speechiness'].apply(lambda x: np.log(x))

In [None]:
X_test=pd.DataFrame(test)
X_test

In [None]:
X_test.drop(['id','instrumentalness'],inplace=True,axis=1)

In [None]:
X_test

In [None]:
#imputing the test data with the imputers fitted on train (transform only, no refit)
test[cols_to_be_imputed_with_mean]=imputer.transform(test[cols_to_be_imputed_with_mean])
test['key']=impute_mode.transform(test['key'].to_numpy().reshape(-1,1))
test[cols_to_be_imputed_with_median]=impute_med.transform(test[cols_to_be_imputed_with_median])
# X_test was derived from `test` before this imputation ran, so rebuild it here;
# otherwise the feature matrix passed to the scaler/models can still contain the
# original NaNs. Same columns are removed as when X_test was first created.
X_test=test.drop(columns=['id','instrumentalness'])

In [None]:
X_test=pipeline.transform(X_test)

In [None]:
# Predict on the prepared feature matrix, not on the raw `test` frame: `test`
# still holds 'id' and 'instrumentalness', which the model was not trained on.
sub['song_popularity']=svc_final.predict(X_test)

In [None]:
sub['song_popularity'].value_counts()

In [None]:
ens1.get_params()

In [None]:
#Preparing the whole training set for the RandomForestClassifier
# Leave out the target (using it as a feature is target leakage), the row id and
# the raw instrumentalness (its log column is kept), matching the other
# feature-matrix cells of this notebook.
X_train=train.drop(columns=['id','instrumentalness','song_popularity'])
X_train=pipeline.fit_transform(X_train)

In [None]:
X_train

In [None]:
from sklearn.ensemble import RandomForestClassifier

ens1=RandomForestClassifier(random_state=42)
param_grid={'n_estimators':[50,100,200],'max_features': [4,6,8]}

#grid_ens=GridSearchCV(ens1,param_grid,verbose=10,scoring='roc_auc',cv=10)
#grid_ens.fit(X_train,y_train)

In [None]:
ens1.get_params()

In [None]:
grid_ens.best_estimator_

In [None]:
ens_final=RandomForestClassifier(max_features=6, random_state=42,n_estimators=50)
ens_final.fit(X_train,y_train)

In [None]:
sub['song_popularity']=ens_final.predict(X_test)

In [None]:
sub['song_popularity'].value_counts()

In [None]:
# `ens1` is only ever handed to GridSearchCV (which fits clones), so it is never
# fitted itself; use the model that was actually fitted.
# NOTE(review): assumes ens_final is the intended model for this check - confirm.
y_val_pred=ens_final.predict(X_val)

In [None]:
#accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(y_val,y_val_pred)

In [None]:
test.info()

In [None]:
test.drop(['instrumentalness','liveness','speechiness'],axis=1,inplace=True)

In [None]:
test

In [None]:
X_test=pipeline.transform(test)

In [None]:
X_test

In [None]:
# Explicit copy: the prediction column is added to `sub` afterwards, and writing
# into a slice of `test` triggers SettingWithCopyWarning.
sub=test[['id']].copy()

In [None]:
sub['song_popularity']=ens_final.predict(X_test)

In [None]:
sub['song_popularity'].value_counts()   #50

In [None]:
# index=False: write only the 'id' and 'song_popularity' columns, not the
# DataFrame's row index as an extra unnamed first column.
sub.to_csv('submission.csv',index=False)

In [None]:
y_train=X_train_sampled['song_popularity']
#X_train_sampled.drop(['song_popularity'],axis=1,inplace=True)

In [None]:
#X_train_sampled.drop(['instrumentalness','liveness','speechiness'],axis=1,inplace=True)

In [None]:
#X_train_sampled.head()

In [None]:
#from sklearn.ensemble import RandomForestClassifier

#ens1=RandomForestClassifier(random_state=42)
#param_grid={'n_estimators':[50,100,200],'max_features': [4,6,8]}

#grid_ens=GridSearchCV(ens1,param_grid,verbose=10,scoring='roc_auc',cv=5)
#grid_ens.fit(X_train_sampled,y_train)

In [None]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(random_state=42,max_iter=300)
mlp_clf.fit(X_train,y_train)

In [None]:
sub['song_popularity']=final_tree.predict(X_test)

In [None]:
sub['song_popularity'].value_counts()

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

ens1=ExtraTreesClassifier(random_state=42,class_weight='balanced')
param_grid={'n_estimators':[50,100,200],'max_features': [4,6,8]}

grid_ens=GridSearchCV(ens1,param_grid,verbose=10,scoring='roc_auc',cv=5)
grid_ens.fit(X_train,y_train)

In [None]:
grid_ens.best_estimator_

In [None]:
final_tree=ExtraTreesClassifier(max_features=6, n_estimators=50,random_state=42)
final_tree.fit(X_train,y_train)

In [None]:
#Stacking Ensemble
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
random_forest_clf = RandomForestClassifier(n_estimators=50, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=50, random_state=42)
mlp_clf = MLPClassifier(random_state=42,max_iter=300)

In [None]:
#Splitting the training data in train and val
from sklearn.model_selection import train_test_split

# random_state makes the split reproducible across runs; stratify keeps the
# class ratio of the (imbalanced) target the same in both parts.
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2,random_state=42,stratify=y_train)

In [None]:
estimators = [random_forest_clf, extra_trees_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train[features_xgb], y_train)

In [None]:
#Predict the val output using each estimator and store the predictions to fit a stack (blender) on the validation set
X_val_predictions = np.empty((len(X_val), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    # The base estimators were fitted on the features_xgb columns only, so they
    # must be given the same columns at prediction time.
    X_val_predictions[:, index] = estimator.predict(X_val[features_xgb])

In [None]:
rnf_blender=RandomForestClassifier(n_estimators=200,oob_score=True, random_state=42)
rnf_blender.fit(X_val_predictions,y_val)

In [None]:
# One column of base-model predictions per estimator; this is the blender's input
X_test_predictions = np.empty((len(test), len(estimators)), dtype=np.float32)

for index, estimator in enumerate(estimators):
    # Use the same feature columns the base estimators were fitted on.
    # NOTE(review): `test` must have gone through the same preprocessing as the
    # training features (imputation, log column, any scaling) - confirm.
    X_test_predictions[:, index] = estimator.predict(test[features_xgb])

In [None]:
sub['song_popularity']=rnf_blender.predict(X_test_predictions)

In [None]:
sub['song_popularity'].value_counts()