In [92]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [58]:
songDf = pd.read_csv("songData.csv")

In [59]:
songDf = songDf.drop(['Unnamed: 0', 'Unnamed: 0.1', 'analysis_url', 'id','track_href','type', 'uri', 'artist',
                       'lyric', 'name'], axis = 1)

In [60]:
songDf = songDf.sample(frac = 1, replace = False)

In [61]:
songDf = pd.get_dummies(songDf, columns=['key','time_signature', 'genre'])

In [62]:
year = []

for date in songDf.date:
    year.append(int(date[0:4]))


In [63]:
songDf.date = year

In [64]:
train_values = songDf.popularity.iloc[0:8000]
test_values = songDf.popularity.iloc[8000:11300]

In [65]:
train = songDf.iloc[0:8000]
test = songDf.iloc[8000:11300]

In [66]:
train = train.drop(['popularity'], axis = 1)
test = test.drop(['popularity'], axis = 1)

In [67]:
train = preprocessing.scale(train)
test = preprocessing.scale(test)

In [96]:
def fold(model, trainData, labels, n_folds):
    accs = 0

    
    
    
    for x in range(0,n_folds):
        start = x * len(trainData)/n_folds
        end = (x+1) * len(trainData)/n_folds
        index = range(0,start) + range(end,len(trainData))
        fit = model.fit(X = trainData[index], y = labels.iloc[index] )
        score = fit.score(trainData[start:end], labels.iloc[start:end])
        accs = accs + score
    
    accs = accs/float(n_folds)
    
    return((accs))

In [104]:
####Trying Multi Layer Neural Network First - Cause I'm so Deep

mlnn = MLPRegressor(solver = "adam")
fit = mlnn.fit(train, train_values)
fold(mlnn, train, train_values, 4 )

0.25986720641829236

In [105]:
fit.score(test,test_values)

0.30442891566235397

In [106]:
####Trying straight up linear regression

lin = LinearRegression()
fit = lin.fit(train, train_values)
fold(lin, train, train_values, 4 )

-1.9763914921247865e+23

In [107]:
fit.score(test,test_values)

-2.0430517901890814e+21

In [108]:
####Trying Nearest Neighbor Regression - using default of 5 neighbours
knn = KNeighborsRegressor()
fit = knn.fit(train, train_values)
fold(knn, train, train_values, 4 )

0.20613879840533758

In [109]:
fit.score(test,test_values)

0.2274270603762919

In [110]:
####Trying SVM
model = SVR(kernel = "rbf")
fit = model.fit(train, train_values)
fold(model, train, train_values, 4 )

0.28131665266348771

In [111]:
fit.score(test,test_values)

0.29022493509764269

In [112]:
####Trying Random Forest
model = RandomForestRegressor(n_estimators = 50)
fit = model.fit(train, train_values)
fold(model, train, train_values, 4 )

0.34441671468952584

In [113]:
fit.score(test,test_values)

0.34074401263171616

In [85]:
fit.feature_importances_

array([  5.89190774e-02,   5.71405097e-02,   6.06758036e-02,
         5.40184316e-02,   4.45280456e-02,   5.94561080e-02,
         6.70422338e-02,   5.91810240e-03,   5.64667273e-02,
         6.03958285e-02,   5.81663391e-02,   7.22883601e-02,
         3.91050045e-03,   4.40410877e-03,   3.94268931e-03,
         1.27231112e-03,   3.41944759e-03,   3.48784049e-03,
         4.05375199e-03,   4.62271862e-03,   3.74232684e-03,
         4.47777714e-03,   2.72232134e-03,   4.15918109e-03,
         1.40220151e-05,   1.44217422e-04,   1.21342905e-03,
         1.34069136e-03,   4.28604790e-04,   7.09672344e-02,
         1.53586956e-03,   2.55312204e-02,   3.28896206e-02,
         1.31016439e-01,   3.22634571e-02,   1.46361259e-03,
         1.96103973e-03])

In [87]:
songDf.columns

Index([u'acousticness', u'danceability', u'duration_ms', u'energy',
       u'instrumentalness', u'liveness', u'loudness', u'mode', u'speechiness',
       u'tempo', u'valence', u'date', u'popularity', u'key_0', u'key_1',
       u'key_2', u'key_3', u'key_4', u'key_5', u'key_6', u'key_7', u'key_8',
       u'key_9', u'key_10', u'key_11', u'time_signature_0',
       u'time_signature_1', u'time_signature_3', u'time_signature_4',
       u'time_signature_5', u'genre_blues', u'genre_country',
       u'genre_electric', u'genre_jazz', u'genre_metal', u'genre_pop',
       u'genre_rap', u'genre_rock'],
      dtype='object')

In [115]:
values = pd.DataFrame(fit.predict(test))

In [126]:
 test_values.reset_index()['popularity'];

In [127]:
values['true'] =  test_values.reset_index()['popularity']

In [131]:
values

Unnamed: 0,0,true
0,67.920000,69
1,42.260000,51
2,64.200000,2
3,12.660000,7
4,33.041667,33
5,11.000000,19
6,58.500000,61
7,26.040000,33
8,22.400000,21
9,37.840000,27


In [132]:
testing = songDf.iloc[8000:11500]

In [136]:
values['year'] = testing.date.reset_index()['date']

In [138]:
values.to_csv("popPrediction.csv")