In [1]:
import pdb
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
%matplotlib inline
import seaborn as sns

In [2]:
data = pd.read_csv("../data/interum/top_genres.csv")

In [3]:
def plot_scatter_relationship(data, x, y="Energy"):
    plt.scatter(x=data[x], y=data[y], alpha=0.1)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.title(f"{x} vs. {y}")

In [4]:
def plot_distributions(data, x):
    scaler = MinMaxScaler()
    sns.distplot(a=scaler.fit(np.array(data[x]).reshape(-1,1)).transform(np.array(data[x]).reshape(-1,1)), bins=500)
    plt.title(x)
    plt.show()


In [5]:
def adjusted_r_squared(r_squared, num_samples, num_regressors):
    return 1 - ((1-r_squared)*(num_samples - 1) / (num_samples - num_regressors - 1))

## Linear Regression Predicting _Energy_

subset data relevant for predicting energy: 

In [6]:
energy_reg_df = data[[
    "Danceability",
    "Energy",
    "Loudness",
    "Mode",
    "Speechness",
    "Acousticness",
    "Instrumentalness",
    "Liveness",
    "Valence",
    "Tempo",
    "Duration_ms",
    'Super_genre',
    'time_signature'
]]
energy_reg_df.head()

Unnamed: 0,Danceability,Energy,Loudness,Mode,Speechness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,Super_genre,time_signature
0,0.624,0.857,-6.25,0.0,0.0542,0.0208,0.206,0.11,0.324,131.926,282920.0,metal,4.0
1,0.517,0.916,-4.933,1.0,0.0559,0.000182,0.00191,0.306,0.444,135.996,300320.0,metal,4.0
2,0.251,0.894,-4.103,0.0,0.057,0.0144,0.0,0.123,0.297,114.223,175353.0,metal,4.0
3,0.469,0.743,-5.57,0.0,0.0272,0.00222,0.000111,0.276,0.481,86.953,272292.0,metal,4.0
4,0.487,0.952,-4.429,0.0,0.0613,0.000228,0.0,0.161,0.329,125.993,237933.0,metal,4.0


Duration appears to benefit from a log transformation, so we add that here

In [7]:
energy_reg_df['log_duration'] = np.log(energy_reg_df.Duration_ms)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Dummy Variables

* Mode - basically major or minor
* Key - 0 - 11 for the 12 pitch names
* time_signature - some integer representing the numerator in the time signature

#### Step 1
cast all categorical variables to strings so we can use pd.get_dummies()

In [8]:
energy_reg_df.Mode = energy_reg_df.Mode.astype(str)
energy_reg_df.time_signature = energy_reg_df.time_signature.astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


#### step 2

generate dummy variables using get_dummies()

In [9]:
energy_reg_df=pd.get_dummies(energy_reg_df)
energy_reg_df.head()

Unnamed: 0,Danceability,Energy,Loudness,Speechness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,...,Super_genre_step,Super_genre_swing,Super_genre_synth,Super_genre_techno,Super_genre_worship,time_signature_0.0,time_signature_1.0,time_signature_3.0,time_signature_4.0,time_signature_5.0
0,0.624,0.857,-6.25,0.0542,0.0208,0.206,0.11,0.324,131.926,282920.0,...,0,0,0,0,0,0,0,0,1,0
1,0.517,0.916,-4.933,0.0559,0.000182,0.00191,0.306,0.444,135.996,300320.0,...,0,0,0,0,0,0,0,0,1,0
2,0.251,0.894,-4.103,0.057,0.0144,0.0,0.123,0.297,114.223,175353.0,...,0,0,0,0,0,0,0,0,1,0
3,0.469,0.743,-5.57,0.0272,0.00222,0.000111,0.276,0.481,86.953,272292.0,...,0,0,0,0,0,0,0,0,1,0
4,0.487,0.952,-4.429,0.0613,0.000228,0.0,0.161,0.329,125.993,237933.0,...,0,0,0,0,0,0,0,0,1,0


## Linear Regression Model

Using energy as our target

In [10]:
# Set dependent and independent variables
y = energy_reg_df.Energy
X = energy_reg_df.drop('Energy', axis=1)
# Get dummy variables and add constant for intercept
X = sm.add_constant(X)

# Split data 
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.7)

# fit and summarize your model
model = sm.OLS(y,X).fit()
model.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,Energy,R-squared:,0.768
Model:,OLS,Adj. R-squared:,0.768
Method:,Least Squares,F-statistic:,8516.0
Date:,"Mon, 29 Jul 2019",Prob (F-statistic):,0.0
Time:,14:41:37,Log-Likelihood:,75529.0
No. Observations:,107973,AIC:,-151000.0
Df Residuals:,107930,BIC:,-150600.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.3912,0.013,29.466,0.000,0.365,0.417
Danceability,-0.1682,0.003,-64.554,0.000,-0.173,-0.163
Loudness,0.0279,0.000,264.884,0.000,0.028,0.028
Speechness,0.2247,0.004,57.103,0.000,0.217,0.232
Acousticness,-0.2756,0.002,-180.811,0.000,-0.279,-0.273
Instrumentalness,0.0723,0.001,59.354,0.000,0.070,0.075
Liveness,0.1048,0.002,45.262,0.000,0.100,0.109
Valence,0.1579,0.002,91.481,0.000,0.155,0.161
Tempo,0.0003,1.31e-05,24.184,0.000,0.000,0.000

0,1,2,3
Omnibus:,2239.112,Durbin-Watson:,1.554
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3786.753
Skew:,-0.187,Prob(JB):,0.0
Kurtosis:,3.838,Cond. No.,6.79e+16


## Variable Selection work

the code below will step through adding variables to our regression and provide the r squared and adjusted r squared scores along the way.  

_Note: depending on the number of variables, this will take a while to run._

In [11]:
# y = energy_reg_df.Energy
# predictors = energy_reg_df.drop('Energy', axis=1)



# linreg = LinearRegression()
# for i in range(1,energy_reg_df.shape[1]):
#     print("\n\n")
#     print('Num Variables', i)
#     selector = RFE(linreg, n_features_to_select=i)

#     selector = selector.fit(X=predictors, y=y)

#     print("Which columns were kept:\t", selector.support_)
# #     print("The coefficients:\t\t", selector.estimator_.coef_)
# #     print("The intercept:\t\t\t", selector.estimator_.intercept_)
#     r_squared = selector.score(predictors,y)
#     adj_r_squared = adjusted_r_squared(r_squared, energy_reg_df.shape[0], i)
    
#     pdb.set_trace()
    
#     print("R Squared\t\t", r_squared)
#     print("Adjusted R Squared\t", adj_r_squared)