In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from boruta import BorutaPy

## Reading The Data 

In [None]:
# Read one CSV per decade from the Kaggle input directory, in chronological order.
decade_suffixes = ['60', '70', '80', '90', '00', '10']
datas = []
for decade in decade_suffixes:
    csv_path = "/kaggle/input/the-spotify-hit-predictor-dataset/dataset-of-{}s.csv".format(decade)
    datas.append(pd.read_csv(csv_path))

In [None]:
# Tag each per-decade frame with the decade it was loaded for.
decade_labels = [1960, 1970, 1980, 1990, 2000, 2010]
for frame, decade in zip(datas, decade_labels):
    frame['decade'] = pd.Series(decade, index=frame.index)

# Stack all decades, shuffle the rows reproducibly, then renumber the index.
stacked = pd.concat(datas, axis=0)
data = stacked.sample(frac=1.0, random_state=1).reset_index(drop=True)

## Exploring the Data

In [None]:
# Preview the first rows of the combined, shuffled frame.
data.head()

In [None]:
# (rows, columns) of the combined frame.
data.shape

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Columns stored with dtype `object` are the ones treated as categorical below.
is_object_col = data.dtypes == 'object'
categorical_cols = data.columns[is_object_col]
print(categorical_cols)

In [None]:
# Number of distinct values in each column.
data.nunique(axis=0)

In [None]:
# Missing-value count per column.
data.isna().sum()

## Dropping Unnecessary Columns

In [None]:
# Work on an independent copy: a plain `df = data` only creates an alias, so any
# in-place modification of `df` would silently change the raw `data` frame too
# and make the exploration outputs shown earlier stale.
df = data.copy()

In [None]:
# Derive the numeric-only frame from `data` instead of dropping in place.
# This leaves `data` untouched and lets the cell be re-run safely
# (an in-place drop raises KeyError on the second run).
df = data.drop(columns=categorical_cols)
df.columns

In [None]:
# Remove the helper `decade` column so it is not used as a model feature.
# Rebinding (rather than `inplace=True`) avoids mutating any other variable
# that still refers to the same frame.
df = df.drop(columns="decade")
df.columns

## Train Test Split with Stratify

In [None]:
# Target labels to predict.
y = df["target"]

In [None]:
# Class balance of the full target.
y.value_counts()

In [None]:
# Feature matrix: every remaining column except the target.
X = df.drop(columns="target")
X.columns

In [None]:
# Stratified hold-out split (default test size) with a fixed seed so the
# class ratio is preserved in both parts and the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [None]:
# Class balance in the training split.
y_train.value_counts()

In [None]:
# Class balance in the test split.
y_test.value_counts()

## Scaling the Data

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits. Each result is wrapped back into a DataFrame
# so the original row index and column names are kept.
scaler = StandardScaler().fit(X_train)
X_train, X_test = (
    pd.DataFrame(scaler.transform(part), index=part.index, columns=part.columns)
    for part in (X_train, X_test)
)

In [None]:
# Preview the scaled training features.
X_train.head()

In [None]:
# Preview the scaled test features.
X_test.head()

## Feature Selection

## A. Feature Selection using Boruta Method

### 1. Train a Random Forest Model to be used by Boruta

* Since we already found a decent Random Forest model with ```n_estimators=100, max_depth=100,max_leaf_nodes=500,min_samples_leaf=5,random_state=42,ccp_alpha=0.0004```, we will reuse the same configuration here.

In [None]:
# Random Forest with the hyper-parameters quoted in the note above,
# fitted on the scaled training split.
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    max_leaf_nodes=500,
    min_samples_leaf=5,
    ccp_alpha=0.0004,
    random_state=42,
)
rf.fit(X_train, y_train)

In [None]:
# Mean accuracy of the forest on each split.
train_score = rf.score(X_train, y_train)
test_score = rf.score(X_test, y_test)
print("Train Score: ", train_score)
print("Test Score: ", test_score, "\n")

### 1.1 Important Features Suggested by Random Forest

In [None]:
# Pair each training feature with its importance from the fitted forest and
# sort ascending; `barh` draws the first entry at the bottom, so the most
# important feature ends up at the top of the chart.
# Names come from `X_train` (the frame the model was fitted on), and the sort
# key no longer reuses the name `X`, which elsewhere denotes the feature frame.
f_i = sorted(zip(X_train.columns, rf.feature_importances_), key=lambda pair: pair[1])

fig, ax = plt.subplots()
ax.barh([name for name, _ in f_i], [importance for _, importance in f_i])
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")
ax.set_title("Random Forest feature importances")
plt.show()

> ### Mode, Time_signature, tempo, liveness, chorus_hit and key are labelled as unnecessary based on the Random Forest feature importances

### 2. Boruta Feature Selection 

In [None]:
# BorutaPy reconfigures (via `set_params`) and refits the estimator it is given.
# Passing `rf` directly would therefore alter the model that was trained and
# scored above, so hand Boruta a fresh, unfitted forest with identical
# hyper-parameters instead.
# NOTE(review): with n_estimators='auto' BorutaPy derives the tree count from
# the estimator's max_depth; with max_depth=100 this is presumably very small.
# Confirm against the verbose output or set n_estimators explicitly.
boruta_rf = RandomForestClassifier(**rf.get_params())
feat_selector = BorutaPy(boruta_rf, n_estimators='auto', verbose=2, random_state=42)

In [None]:
# Boruta is fed plain NumPy arrays rather than pandas objects.
boruta_X = X_train.to_numpy()
boruta_y = y_train.to_numpy()
feat_selector.fit(boruta_X, boruta_y)

In [None]:
# Boolean mask over the training columns: which features Boruta keeps.
print(feat_selector.support_)

In [None]:
# Boruta rank for each training column, in column order.
print(feat_selector.ranking_) 

In [None]:
# One (feature name, Boruta rank, keep flag) tuple per training column.
feature_ranks = [
    (name, rank, keep)
    for name, rank, keep in zip(
        X_train.columns, feat_selector.ranking_, feat_selector.support_
    )
]


In [None]:
# Print the Boruta verdict for every feature, one line each.
for name, rank, keep in feature_ranks:
    print('Feature: {:<30} Rank: {},  Keep: {}'.format(name, rank, keep))

> ### key is labelled as unnecessary by Random Forest (Boruta)

## B. Feature Selection using RFECV and Logistic Regression

In [None]:
# Recursive feature elimination with cross-validation, driven by an
# L1-penalised logistic regression.
min_features_to_select = 1
clf = LogisticRegression(C=0.1, max_iter=10000, penalty='l1', random_state=42, solver='saga')
cv = StratifiedKFold(5)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
# Fit on the scaled training split, not on the raw full dataset `X`/`y`:
#  * RFE ranks features by coefficient magnitude, which is only comparable
#    across features when they share a common scale;
#  * `X` still contains the held-out test rows, so selecting features on it
#    would leak test information into the selection.
rfecv.fit(X_train, y_train)
print(f"Optimal number of features: {rfecv.n_features_}")

In [None]:
# Mean cross-validated accuracy (with its standard deviation as error bars)
# against the number of features kept by RFECV.
cv_results = rfecv.cv_results_
mean_scores = cv_results["mean_test_score"]
std_scores = cv_results["std_test_score"]
n_scores = len(mean_scores)
feature_counts = range(min_features_to_select, n_scores + min_features_to_select)

fig, ax = plt.subplots()
ax.set_xlabel("Number of features selected")
ax.set_ylabel("Mean test accuracy")
ax.errorbar(feature_counts, mean_scores, yerr=std_scores)
ax.set_title("Recursive Feature Elimination \nwith correlated features")
plt.show()

In [None]:
# Boolean mask of the features RFECV keeps.
rfecv.support_

In [None]:
# RFECV rank for each feature, in column order.
rfecv.ranking_

In [None]:
# One (feature name, RFECV rank, keep flag) tuple per column.
rfe_feature_ranks = [
    (name, rank, keep)
    for name, rank, keep in zip(X_train.columns, rfecv.ranking_, rfecv.support_)
]

In [None]:
# Print the RFECV verdict for every feature, one line each.
for name, rank, keep in rfe_feature_ranks:
    print('Feature: {:<30} Rank: {},  Keep: {}'.format(name, rank, keep))

> ### Speechiness, Liveness, Tempo, Duration_ms are labelled as unnecessary by RFECV (Logistic Regression)

### Consolidating all the results, we decide to go with the Random Forest feature importances, which labelled ```Mode, Time_signature, tempo, liveness, chorus_hit, key``` as unnecessary features

## Data Transformation and Saving

In [None]:
# Training features with the columns judged unnecessary removed.
X_train_final = X_train.drop(
    columns=["mode", "time_signature", "tempo", "liveness", "chorus_hit", "key"]
)

In [None]:
# Test features with the same columns removed as for the training split.
X_test_final = X_test.drop(
    columns=["mode", "time_signature", "tempo", "liveness", "chorus_hit", "key"]
)

In [None]:
# Persist the reduced training features (row index not written).
X_train_final.to_csv('X_train_selected.csv', index=False)

In [None]:
# Persist the reduced test features (row index not written).
X_test_final.to_csv('X_test_selected.csv', index=False)

In [None]:
# Persist the training labels in NumPy's .npy format.
np.save('y_train.npy', y_train)

In [None]:
# Persist the test labels in NumPy's .npy format.
np.save('y_test.npy', y_test)

In [None]:
# Persist the full training feature set; `X_train` here is the scaled frame
# produced in the scaling section (row index not written).
X_train.to_csv('X_train.csv', index=False)

In [None]:
# Persist the full test feature set; `X_test` here is the scaled frame
# produced in the scaling section (row index not written).
X_test.to_csv('X_test.csv', index=False)

In [None]:
# Persist the complete feature matrix `X`; unlike X_train/X_test it was never
# passed through the scaler, so these values are unscaled.
X.to_csv("X.csv",index=False)

In [None]:
# Persist the complete target vector in NumPy's .npy format.
np.save('y.npy',y)

In [None]:
# Sanity check: number of test labels.
y_test.shape

In [None]:
# Re-join the reduced train and test parts (train rows first) so that
# features and labels stay aligned row for row.
X_selected = pd.concat([X_train_final, X_test_final], axis=0)
y_selected = np.concatenate([y_train, y_test])

In [None]:
# Persist the re-joined reduced feature set (row index not written).
X_selected.to_csv('X_selected.csv', index=False)

In [None]:
# Persist the labels matching X_selected (train labels first, then test).
np.save('y_selected.npy',y_selected)