In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
from sklearn.model_selection import (train_test_split,cross_val_predict,
                                     cross_val_score,cross_validate,GridSearchCV,StratifiedKFold
                                    )
from sklearn import ensemble, linear_model,svm,neighbors,neural_network
from sklearn.metrics import (classification_report,confusion_matrix,
                             accuracy_score,auc,mean_absolute_error,r2_score,mean_squared_error,
                             plot_roc_curve,roc_auc_score,roc_curve)
from sklearn.preprocessing import (OneHotEncoder,LabelEncoder,MinMaxScaler,StandardScaler,)
from sklearn.decomposition import PCA,kernel_pca,KernelPCA
from sklearn.feature_selection import SelectKBest, chi2
from xgboost import XGBClassifier,XGBRegressor
import lightgbm as lgb
# `warnings` has to be imported before it is used: no earlier line of this cell
# imports it, so on a fresh kernel the call below would raise NameError.
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")



In [None]:
# Read the training set; the relative path is resolved against the notebook's
# working directory.
df = pd.read_csv("Train_data.csv")
# Show the first rows as the cell output.
df.head()

Unnamed: 0,id,acousticness,danceability,energy,explicit,instrumentalness,key,liveness,loudness,mode,release_date,speechiness,tempo,valence,year,duration-min,popularity
0,2015,0.949,0.235,0.0276,No,0.927,5,0.513,-27.398,Major,01-01-1947,0.0381,110.838,0.0398,1947,3.0,very low
1,15901,0.855,0.456,0.485,No,0.0884,4,0.151,-10.046,Major,13-11-2020,0.0437,152.066,0.859,2020,2.4,low
2,9002,0.827,0.495,0.499,No,0.0,0,0.401,-8.009,Minor,01-01-1950,0.0474,108.004,0.709,1950,2.6,very low
3,6734,0.654,0.643,0.469,No,0.108,7,0.218,-15.917,Major,30-04-1974,0.0368,83.636,0.964,1974,2.4,low
4,15563,0.738,0.705,0.311,No,0.0,5,0.322,-12.344,Major,01-01-1973,0.0488,117.26,0.785,1973,3.4,average


In [None]:
def preprocessing(df, istest=False,isOutlier_removal=True):
    """Encode categorical columns, split off the release date and tame outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``id`` and ``release_date``; ``popularity``
        is needed unless ``istest`` is true, and ``tempo`` / ``duration-min`` are
        needed when ``isOutlier_removal`` is true. The caller's frame is not
        modified: every step below works on a new frame.
    istest : bool, default False
        When true, the ``popularity`` column is not recoded.
    isOutlier_removal : bool, default True
        When true, zero tempos are replaced and ``duration-min`` is capped at 10.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The processed frame without ``id`` and ``release_date``, and the parsed
        release dates (same index as the input frame).
    """
    # DataFrame.replace returns a new frame, so the input stays untouched.
    df=df.replace({"explicit":{"Yes":1,"No":0},
              "mode":{"Major":1,"Minor":0},})
    if not istest:
        # Ordinal encoding of the target classes.
        df["popularity"]=df["popularity"].replace({'very low': 1, 'low': 2, 'average': 3, 'high':4, 'very high': 5})

    # The dates are written day-first (e.g. "13-11-2020", "30-04-1974"); without
    # dayfirst=True ambiguous values such as "05-03-2000" are read month-first.
    release_date=pd.to_datetime(df["release_date"],dayfirst=True)
    df=df.drop(columns=["id","release_date"])

    if isOutlier_removal:
        # Single .loc assignment instead of chained indexing, which raised
        # SettingWithCopyWarning and may silently write to a temporary copy.
        # NOTE(review): 39.875 is a hard-coded replacement for zero tempo;
        # presumably a chosen lower bound — confirm its origin.
        df.loc[df["tempo"]==0,"tempo"]=39.875
        df["duration-min"]=df["duration-min"].clip(upper=10)
    return df,release_date
        
    
def feature_engineering(df,isdrop=True,release_date=None):
    """Add calendar features derived from the release dates.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is not modified; a new frame is returned.
    isdrop : bool, default True
        When true, the ``month`` and ``key`` columns are removed from the result.
    release_date : pandas.Series of datetimes, optional
        Dates aligned with ``df`` by index. When omitted, the notebook-level
        variable ``release_date`` is used, which is how this function
        originally obtained the dates.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus ``day``, ``day_of_week``, ``day_of_year`` and, unless
        ``isdrop`` is true, ``month``.

    Raises
    ------
    NameError
        If ``release_date`` is neither passed nor defined at module level.
    """
    if release_date is None:
        # Backward compatibility with callers that rely on the global variable.
        try:
            release_date=globals()["release_date"]
        except KeyError:
            raise NameError("name 'release_date' is not defined; pass it via release_date=...") from None

    # assign() builds a new frame, so the caller's frame keeps its columns.
    df=df.assign(
        day=release_date.dt.day,
        # this feature could be one-hot encoded
        day_of_week=release_date.dt.dayofweek,
        day_of_year=release_date.dt.dayofyear,
        month=release_date.dt.month,
    )

    # Alternative: keep "key" and one-hot encode it instead of dropping it.
    if isdrop:
        df=df.drop(columns=["month","key"])
    return df

In [None]:
def distWithPopularity(f,p):
    """Overlay the distribution of feature ``f`` for every class found in ``p``.

    ``f`` is filtered with the boolean mask ``p==value`` for each distinct value
    of ``p``; one curve is drawn per value, labelled with the value's string
    form, and the figure is shown.
    """
    classes=p.unique()
    for cls in classes:
        sns.distplot(f[p==cls],label=str(cls))
    plt.legend([str(cls) for cls in classes])
    plt.show()
    
def add_feature(x,f):
    """Return ``x`` with ``f`` appended as extra column(s), matched on index."""
    combined=pd.concat(objs=(x,f),axis="columns")
    return combined

def drop_feature(x,cols):
    """Return a new frame made from ``x`` without the columns listed in ``cols``."""
    remaining=x.drop(columns=cols)
    return remaining

def get_revenue(y_true,y_pred):
    """Total revenue earned by a set of predictions.

    Each sample contributes the revenue of its *true* label when the predicted
    label is equal to or higher than the true one, and nothing when the
    prediction is lower.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        True and predicted labels of equal length. Values are compared by
        position, so lists, NumPy arrays and pandas Series (whatever their
        index) can be mixed freely.

    Returns
    -------
    int
        Sum of the per-sample revenues; 0 for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    KeyError
        If a contributing true label is missing from the revenue table.
    """
    bid2revenue={1:2,2:4,3:6,4:8,5:10}
    # Positional arrays: avoids index-alignment errors between Series and makes
    # plain Python lists work as well.
    true_arr=np.asarray(y_true)
    pred_arr=np.asarray(y_pred)
    if true_arr.shape!=pred_arr.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (true_arr.shape,pred_arr.shape)
        )
    # "exact match" and "over-prediction" both pay the true label's revenue,
    # so a single >= mask covers the two cases.
    earning=true_arr[pred_arr>=true_arr]
    return sum(bid2revenue[bid] for bid in earning.tolist())

def model_classifier(rmf,x,y,isval=False):
    """Fit (unless validating) and report classification quality on ``x``/``y``.

    When ``isval`` is false the estimator ``rmf`` is fitted on ``x``/``y`` first.
    In both modes the estimator predicts ``x`` and a 4-digit classification
    report, the confusion matrix and a separator line are printed under a
    "Train:" or "Validation:" heading. The same estimator object is returned.
    """
    if isval:
        heading="\nValidation:"
    else:
        rmf.fit(x,y)
        heading="\nTrain:"
    predictions=rmf.predict(x)
    print(heading)
    print(classification_report(y,predictions,digits=4))
    print()
    print(confusion_matrix(y,predictions))
    print("-"*50)
    return rmf

def model_regressor(rmf,x,y,isval=False):
    """Fit (unless validating) and report regression quality on ``x``/``y``.

    When ``isval`` is false the estimator ``rmf`` is fitted on ``x``/``y`` first.
    In both modes the estimator predicts ``x``; the R2 score, the mean absolute
    error and a separator line are printed under a "Train:" or "Validation:"
    heading. Returns the estimator together with its predictions.
    """
    if isval:
        heading="\nValidation:"
    else:
        rmf.fit(x,y)
        heading="\nTrain:"
    y_pred=rmf.predict(x)
    print(heading)
    print("R2 score: ",r2_score(y,y_pred))
    print("mae error: ",mean_absolute_error(y,y_pred))
    print("-"*50)
    return rmf,y_pred

In [None]:
# Clean the raw frame. `release_date` (the parsed dates) becomes a notebook-level
# variable, which feature_engineering picks up as a global.
# Note: `df` is overwritten here, so this cell cannot be re-run without reloading
# the CSV — preprocessing needs the "id"/"release_date" columns it has just dropped.
df,release_date=preprocessing(df)
df=feature_engineering(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [None]:
# Inspect the frame after preprocessing and feature engineering.
df

Unnamed: 0,acousticness,danceability,energy,explicit,instrumentalness,liveness,loudness,mode,speechiness,tempo,valence,year,duration-min,popularity,day,day_of_week,day_of_year
0,0.9490,0.2350,0.0276,0,0.927000,0.513,-27.398,1,0.0381,110.838,0.03980,1947,3.0,1,1,2,1
1,0.8550,0.4560,0.4850,0,0.088400,0.151,-10.046,1,0.0437,152.066,0.85900,2020,2.4,2,13,4,318
2,0.8270,0.4950,0.4990,0,0.000000,0.401,-8.009,0,0.0474,108.004,0.70900,1950,2.6,1,1,6,1
3,0.6540,0.6430,0.4690,0,0.108000,0.218,-15.917,1,0.0368,83.636,0.96400,1974,2.4,2,30,1,120
4,0.7380,0.7050,0.3110,0,0.000000,0.322,-12.344,1,0.0488,117.260,0.78500,1973,3.4,3,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12222,0.0408,0.8090,0.8010,0,0.000000,0.353,-5.461,1,0.4070,81.940,0.74400,2014,3.4,3,7,1,7
12223,0.9120,0.4510,0.2400,0,0.000002,0.175,-14.014,1,0.0351,134.009,0.70100,1959,2.0,5,1,3,1
12224,0.3280,0.5510,0.5640,0,0.002950,0.352,-9.298,0,0.0338,124.883,0.89000,1984,2.5,2,1,6,1
12225,0.1220,0.0608,0.9390,0,0.991000,0.912,-26.324,1,0.1180,73.234,0.00558,2017,3.1,4,1,4,244


In [None]:
# Features and target; "day_of_week" is left out of the feature set.
X=df.drop(columns=["popularity","day_of_week"])
y=df.popularity
# Stratified, seeded 80/20 split.
x_train,x_test,y_train,y_test=train_test_split(X,y,stratify=y,shuffle=True,test_size=0.2, random_state = 123)
# The split returns slices of X; take explicit copies so the column assignments
# below write to independent frames instead of raising SettingWithCopyWarning.
x_train=x_train.copy()
x_test=x_test.copy()


# Feature scaling to 0-1: the scaler is fitted on the training split only and
# then reused unchanged on the test split.
scaler=MinMaxScaler()
col=['day','loudness','tempo', 'year','duration-min', 'day_of_year']
x_train[col]=scaler.fit_transform(x_train[col])
x_test[col]=scaler.transform(x_test[col])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index

In [None]:
def model(rmf,x,y,isval=False):
    """Fit (unless validating) and print a classification summary for ``x``/``y``.

    When ``isval`` is false the estimator ``rmf`` is fitted on ``x``/``y`` first.
    In both modes the estimator predicts ``x`` and the classification report,
    the confusion matrix and a separator line are printed under a "Train:" or
    "Validation:" heading. The same estimator object is returned.
    """
    if isval:
        heading="\nValidation:"
    else:
        rmf.fit(x,y)
        heading="\nTrain:"
    predictions=rmf.predict(x)
    print(heading)
    print(classification_report(y,predictions))
    print()
    print(confusion_matrix(y,predictions))
    print("-"*50)
    return rmf

In [None]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV
# Search space for the RBF-kernel SVC: 2 values of C x 4 values of gamma = 8 candidates.
parameters = dict(
    C=[10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

In [None]:
# Shuffled, seeded stratified splitter (default number of folds).
# NOTE(review): the GridSearchCV calls visible in this notebook pass cv=10
# instead of this object, so `cv` appears to be unused — confirm the intent.
cv=StratifiedKFold(shuffle=True,random_state=0)

In [None]:
# Exhaustive search over `parameters` for an SVC, scored by accuracy with
# 10-fold cross-validation; the best configuration is refit afterwards.
# n_jobs=-1 spreads the independent fits over all CPU cores — run one after the
# other they took close to an hour — without changing the scores.
grid_search = GridSearchCV(estimator = SVC(),
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           refit= True,
                           n_jobs = -1,
                           verbose=3)

In [None]:
# Run the cross-validated search on the training split only; the held-out
# split (x_test / y_test) is not touched here.
grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.434, total=   6.8s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.479, total=   7.0s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.8s remaining:    0.0s


[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.497, total=   7.0s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............
[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.455, total=   6.8s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............
[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.487, total=   6.7s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............
[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.453, total=   6.7s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............
[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.469, total=   6.8s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............
[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.431, total=   6.9s
[CV] C=0.1, class_weight=balanced, gamma=1, kernel=rbf ...............
[CV]  C=0.1, class_weight=balanced, gamma=1, kernel=rbf, score=0.470, 

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 57.9min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 50, 100],
                         'class_weight': ['balanced',
                                          {1: 1, 2: 1, 3: 1, 4: 1, 5: 5}],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Winning hyper-parameter combination found by the search
print(grid_search.best_params_)

# The estimator configured with those hyper-parameters
print(grid_search.best_estimator_)

{'C': 100, 'class_weight': {1: 1, 2: 1, 3: 1, 4: 1, 5: 5}, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, break_ties=False, cache_size=200,
    class_weight={1: 1, 2: 1, 3: 1, 4: 1, 5: 5}, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [None]:
# NOTE(review): this import sits far below the first cell, where
# warnings.filterwarnings is called; it belongs with the top-of-notebook imports.
import warnings
# RBF SVC with fixed C and gamma: model() fits it on the training split and
# prints the train report, then scores the same fitted estimator on the
# held-out split (isval=True skips fitting). model() returns the estimator it
# was given, so `rmf` refers to the same object as `svc`.
svc=SVC(C=100,gamma=0.1,kernel='rbf')
rmf=model(svc,x_train,y_train)
rmf=model(svc,x_test,y_test,isval=True)


Train:
              precision    recall  f1-score   support

           1       0.87      0.78      0.82      2577
           2       0.57      0.74      0.65      2494
           3       0.54      0.49      0.52      2330
           4       0.60      0.61      0.61      2085
           5       0.00      0.00      0.00       295

    accuracy                           0.64      9781
   macro avg       0.52      0.53      0.52      9781
weighted avg       0.63      0.64      0.63      9781


[[2010  268   63  236    0]
 [ 194 1848  423   29    0]
 [  59  805 1140  326    0]
 [  56  285  462 1282    0]
 [   2   14    7  272    0]]
--------------------------------------------------

Validation:
              precision    recall  f1-score   support

           1       0.86      0.78      0.82       645
           2       0.54      0.71      0.62       624
           3       0.51      0.47      0.49       582
           4       0.61      0.60      0.60       521
           5       0.00   

In [None]:
# Predict the held-out split with the fitted SVC and turn the predictions
# into total revenue.
y_pred=svc.predict(x_test)
get_revenue(y_test,y_pred)

8236

In [None]:
# Search space for the second SVC grid search (same values as `parameters`):
# 2 values of C x 4 values of gamma = 8 candidates.
param = dict(
    C=[10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)
  

In [None]:
from sklearn.svm import SVC 
from sklearn.ensemble import VotingClassifier 
# (name, estimator) pairs for the ensemble. Every member involves randomness
# (bootstrapping, row/column subsampling, probability calibration), so each one
# gets the same fixed seed — the value used for the train/test split — to make
# repeated runs comparable.
ESTIMATOR_SEED = 123
estimator = [
    ('RMF', ensemble.RandomForestClassifier(n_estimators=500,min_samples_leaf=5,max_features="sqrt",n_jobs=-1,random_state=ESTIMATOR_SEED)),
    # probability=True enables predict_proba on the SVC
    ('SVC', SVC(C=100,kernel="rbf",gamma ='auto', probability = True,random_state=ESTIMATOR_SEED)),
    ('LGB', lgb.LGBMClassifier(random_state=ESTIMATOR_SEED)),
    ('XGB', XGBClassifier(colsample_bylevel=0.8,colsample_bynode=1, colsample_bytree=0.8, gamma=0,learning_rate=0.005, max_depth=8,min_child_weight=1,n_estimators=1000, n_jobs=-1,objective='multi:softprob', reg_alpha=0,reg_lambda=1,  subsample=0.8,random_state=ESTIMATOR_SEED)),
]

In [None]:
# Second accuracy-scored, 10-fold grid search for an SVC, this time over `param`.
# n_jobs=-1 spreads the independent fits over all CPU cores instead of running
# them one after the other; the scores are unaffected.
grid_search = GridSearchCV(estimator = SVC(),
                           param_grid = param,
                           scoring = 'accuracy',
                           cv = 10,
                           refit= True,
                           n_jobs = -1,
                           verbose=3)
grid_search.fit(x_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV] C=10, gamma=1, kernel=rbf .......................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........... C=10, gamma=1, kernel=rbf, score=0.562, total=   5.9s
[CV] C=10, gamma=1, kernel=rbf .......................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.9s remaining:    0.0s


[CV] ........... C=10, gamma=1, kernel=rbf, score=0.562, total=   6.2s
[CV] C=10, gamma=1, kernel=rbf .......................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   12.1s remaining:    0.0s


[CV] ........... C=10, gamma=1, kernel=rbf, score=0.571, total=   6.2s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.573, total=   6.2s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.596, total=   6.6s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.566, total=   6.6s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.580, total=   6.6s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.589, total=   6.6s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] ........... C=10, gamma=1, kernel=rbf, score=0.567, total=   6.2s
[CV] C=10, gamma=1, kernel=rbf .......................................
[CV] .

[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:  8.8min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=3)

In [None]:
# Winning hyper-parameter combination found by the search
print(grid_search.best_params_)

# The estimator configured with those hyper-parameters
print(grid_search.best_estimator_)

{'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [None]:
# Fit an RBF SVC on the training split and score it on the held-out split.
# NOTE(review): C=1 is used here although the grid search printed just above
# reports C=100 as the best value — confirm whether this is a deliberate
# comparison or a typo.
svc=SVC(C=1,gamma=0.1,kernel='rbf')
rmf=model(svc,x_train,y_train)
rmf=model(svc,x_test,y_test,isval=True)


Train:
              precision    recall  f1-score   support

           1       0.78      0.66      0.72      2577
           2       0.55      0.64      0.59      2494
           3       0.49      0.50      0.50      2330
           4       0.52      0.58      0.55      2085
           5       0.00      0.00      0.00       295

    accuracy                           0.58      9781
   macro avg       0.47      0.48      0.47      9781
weighted avg       0.57      0.58      0.57      9781


[[1706  288  147  436    0]
 [ 355 1590  473   76    0]
 [  91  730 1168  341    0]
 [  34  264  576 1211    0]
 [   3   11   23  258    0]]
--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))



Validation:
              precision    recall  f1-score   support

           1       0.77      0.66      0.71       645
           2       0.53      0.62      0.57       624
           3       0.47      0.47      0.47       582
           4       0.51      0.57      0.54       521
           5       0.00      0.00      0.00        74

    accuracy                           0.57      2446
   macro avg       0.46      0.46      0.46      2446
weighted avg       0.56      0.57      0.56      2446


[[424  93  29  99   0]
 [100 390 118  16   0]
 [ 19 192 275  96   0]
 [  8  63 155 295   0]
 [  0   2   4  68   0]]
--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# `vote` is not created by any earlier cell, so a fresh kernel fails here with
# NameError. Build the ensemble from the `estimator` list and fit it on the
# training split before scoring it.
# NOTE(review): voting="soft" is an assumption (the SVC member was created with
# probability=True) — confirm against the original experiment.
vote=VotingClassifier(estimators=estimator,voting="soft")
vote.fit(x_train,y_train)
# isval=True: only predict and report on the held-out split, no refitting.
rmf=model(vote,x_test,y_test,isval=True)


Validation:
              precision    recall  f1-score   support

           1       0.82      0.83      0.83       645
           2       0.55      0.71      0.62       624
           3       0.52      0.47      0.49       582
           4       0.65      0.57      0.61       521
           5       0.75      0.12      0.21        74

    accuracy                           0.64      2446
   macro avg       0.66      0.54      0.55      2446
weighted avg       0.64      0.64      0.63      2446


[[537  52  21  35   0]
 [ 77 443 102   2   0]
 [ 20 222 274  66   0]
 [ 15  80 127 296   3]
 [  2   3   3  57   9]]
--------------------------------------------------


In [None]:
# `rmf` is the estimator returned by the preceding model(...) call; predict the
# held-out split with it.
y_pred = rmf.predict(x_test)


In [None]:
# Display the predicted labels.
y_pred

array([4, 2, 1, ..., 3, 1, 3], dtype=int64)

In [None]:
# Total revenue obtained on the held-out split with the predictions above.
get_revenue(y_test, y_pred)

10910