In [133]:
import numpy as np
import pandas as pd
import pickle
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix


In [134]:
with open('song_df_aggregate.pkl', 'rb') as f:
        df = pickle.load(f)

In [135]:
   #Put the title as dependent variable

In [136]:
dataset=df[df.date<"2019-10-05"][df.firstrank>10]

  """Entry point for launching an IPython kernel.


In [137]:
nan_col=["spotify_duration_ms","spotify_track_number","spotify_danceability","spotify_energy","spotify_loudness",
            "spotify_mode","spotify_speechiness","spotify_acousticness","spotify_instrumentalness",
           "spotify_liveness","spotify_valence","spotify_tempo","spotify_time_signature"]

dataset=dataset.dropna(axis=0,subset=nan_col)

In [138]:
dataset=dataset.drop(["release_year","label_category","artist","title","spotify_album_id",
                      "spotify_album_release_date_precision","album_label","datetime","join",
                      "spotify_id","spotify_disc_number","spotify_album_release_date","spotify_key",
                      "top50","top25","top75","top5","spotify_album_release_date_datetime",
                      "first_date","datetime","isnew","rank","release_month","last_award_type","label_appearance_count","label_appearance_count_group","numberofappearances"],axis=1)

In [139]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7889 entries, 174 to 8518
Data columns (total 28 columns):
date                        7889 non-null object
spotify_explicit            7889 non-null object
spotify_duration_ms         7889 non-null float64
spotify_track_number        7889 non-null float64
spotify_danceability        7889 non-null float64
spotify_energy              7889 non-null float64
spotify_loudness            7889 non-null float64
spotify_mode                7889 non-null float64
spotify_speechiness         7889 non-null float64
spotify_acousticness        7889 non-null float64
spotify_instrumentalness    7889 non-null float64
spotify_liveness            7889 non-null float64
spotify_valence             7889 non-null float64
spotify_tempo               7889 non-null float64
spotify_time_signature      7889 non-null float64
num_artists                 7889 non-null float64
award_num                   7889 non-null float64
gold_count                  7889 non-null f

In [140]:
dataset.date.max()

'2019-09-28'

In [141]:
test_set=dataset[dataset.date>="2016-06-11"]

In [142]:
train_set=dataset[dataset.date<"2016-06-11"]

In [143]:
len(test_set)/len(dataset)

0.2030675624286982

In [144]:
ytrain=train_set["top10"]
ytest=test_set["top10"]
train=train_set.drop(["date","top10","bestrank"],axis=1)
test=test_set.drop(["date","top10","bestrank"],axis=1)

In [145]:
train.info() #exist NaN values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6287 entries, 1864 to 8518
Data columns (total 25 columns):
spotify_explicit            6287 non-null object
spotify_duration_ms         6287 non-null float64
spotify_track_number        6287 non-null float64
spotify_danceability        6287 non-null float64
spotify_energy              6287 non-null float64
spotify_loudness            6287 non-null float64
spotify_mode                6287 non-null float64
spotify_speechiness         6287 non-null float64
spotify_acousticness        6287 non-null float64
spotify_instrumentalness    6287 non-null float64
spotify_liveness            6287 non-null float64
spotify_valence             6287 non-null float64
spotify_tempo               6287 non-null float64
spotify_time_signature      6287 non-null float64
num_artists                 6287 non-null float64
award_num                   6287 non-null float64
gold_count                  6287 non-null float64
platinum_count              6287 non-null

In [146]:
le = preprocessing.LabelEncoder()
train['artist_has_award'] = le.fit_transform(train['artist_has_award'])
train['label_category_group'] = le.fit_transform(train['label_category_group'].astype(str))
train['album_type'] = le.fit_transform(train['album_type'].astype(str))
train['spotify_explicit'] = le.fit_transform(train['spotify_explicit'].astype(str))
train["datetime_year"]=le.fit_transform(train["datetime_year"])
train["datetime_month"]=le.fit_transform(train["datetime_month"])

test['artist_has_award'] = le.fit_transform(test['artist_has_award'])
test['label_category_group'] = le.fit_transform(test['label_category_group'].astype(str))
test['album_type'] = le.fit_transform(test['album_type'].astype(str))
test["datetime_year"]=le.fit_transform(test["datetime_year"])
test["datetime_month"]=le.fit_transform(test["datetime_month"])
test['spotify_explicit'] = le.fit_transform(test['spotify_explicit'].astype(str))


In [147]:
test.dtypes  

spotify_explicit              int64
spotify_duration_ms         float64
spotify_track_number        float64
spotify_danceability        float64
spotify_energy              float64
spotify_loudness            float64
spotify_mode                float64
spotify_speechiness         float64
spotify_acousticness        float64
spotify_instrumentalness    float64
spotify_liveness            float64
spotify_valence             float64
spotify_tempo               float64
spotify_time_signature      float64
num_artists                 float64
award_num                   float64
gold_count                  float64
platinum_count              float64
artist_has_award              int64
num_songs_awards            float64
firstrank                     int64
label_category_group          int64
album_type                    int64
datetime_year                 int64
datetime_month                int64
dtype: object

In [148]:
ytrain

1864     True
1865     True
1866    False
1867    False
1868    False
        ...  
8514    False
8515    False
8516    False
8517    False
8518    False
Name: top10, Length: 6287, dtype: bool

In [149]:
ytrain=pd.DataFrame(le.fit_transform(ytrain),columns=["top10"])
ytest=pd.DataFrame(le.fit_transform(ytest),columns=["top10"])

In [150]:
train.to_pickle("train_set.pkl")
ytrain.to_pickle("ytrain.pkl")
test.to_pickle("test_set.pkl")
ytest.to_pickle("ytest.pkl")

In [113]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=42)

#basic random forest
rfc.fit(train,ytrain.values.ravel())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [114]:
rfc_predict = rfc.predict(test)
print(confusion_matrix(rfc_predict,ytest))

[[1471   93]
 [  35    3]]


In [46]:
#CV to fine tune the parameters

param_grid = {
    'min_samples_split': [3, 5, 10], 
    'n_estimators' : [100, 300],
    'max_depth': [3, 5, 15, 20],
    'max_features': [3, 5, 10, 15]
}

scorers = {
    'precision_score': make_scorer(precision_score),
    'accuracy_score': make_scorer(accuracy_score),
    'recall_score': make_scorer(recall_score)
}



In [47]:
def grid_search_wrapper(refit_score='recall_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization
    prints classifier performance metrics
    """
    skf = StratifiedKFold(n_splits=10) #The folds are made by preserving the percentage of samples for each class.
    grid_search = GridSearchCV(rfc, param_grid, scoring=scorers, refit=refit_score,
                           cv=skf, return_train_score=True, n_jobs=-1)
    grid_search.fit(train, ytrain.values.ravel())

    # make the predictions
    y_pred = grid_search.predict(test)

    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix of Random Forest optimized for {} on the test data:'.format(refit_score))
    print(pd.DataFrame(confusion_matrix(ytest, y_pred),
                 columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
    return grid_search

In [48]:
model=grid_search_wrapper()

KeyboardInterrupt: 

In [None]:
features_importances=model.best_estimator_.feature_importances_
features=train.columns

In [None]:
for i in range(len(features)):
    print ("Importance of "+features[i]+" is: ")
    print(features_importances[i])