# Linear Regression Modeling

In [20]:
# Requirements
import pandas as pd
import numpy as np

# Seed NumPy's legacy global random generator so later np.random calls
# (e.g. the manual shuffle split) give the same result on every run.
np.random.seed(42)

In [2]:
# Load data: read the setlist CSV and key the rows by track name
df = pd.read_csv(r'era_tour_setlist.csv')
df = df.set_index('track_name')

# Process data: cast the explicit flag to an integer so it is kept by the
# numeric-only column filter below
df = df.assign(is_explicit=df['is_explicit'].astype(int))

# Keep only the integer and float columns
numeric_kinds = [int, float]
df = df.select_dtypes(include=numeric_kinds)
print("Size of dataset, ", df.shape)
df.head(2)

Size of dataset,  (44, 15)


Unnamed: 0_level_0,is_explicit,danceability,valence,energy,loudness,acousticness,instrumentalness,liveness,speechiness,key,tempo,mode,duration_ms,time_signature,popularity
track_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Miss Americana & The Heartbreak Prince,0,0.662,0.487,0.747,-6.926,0.028,0.00615,0.138,0.0736,11,150.088,0,234147,4,79
Cruel Summer,0,0.552,0.564,0.702,-5.707,0.117,2.1e-05,0.105,0.157,9,169.994,1,178427,4,94


In [4]:
# Feature selection: the chosen predictors, with the target ('popularity') last
selected_columns = [
    'valence', 'energy', 'mode', 'is_explicit',
    'speechiness', 'liveness', 'key', 'tempo',
    'popularity',
]
df_pred = df[selected_columns]

### Linear Regression with Sklearn packages

In [71]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Separate the predictors from the target column
y = df_pred['popularity']
X = df_pred.drop(columns='popularity')

# Split training and testing dataset (20% of the rows are held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print("Training dataset size: ", X_train.shape)
print("Testing dataset size: ", X_test.shape)

# Standardization: the scaler learns its statistics from the training rows
# only, and the same statistics are then applied to the test rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# modeling
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# evaluation on the held-out rows
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R square: ", r2)

Training dataset size:  (35, 8)
Testing dataset size:  (9, 8)
Mean Squared Error:  41.37862077160974
R square:  0.025677824854537956


### Manual Linear Regression

This section focuses only on the manual implementation of multiple linear regression; the resulting metric values are not the concern here.

In [96]:
# Separate the predictors from the target column
y = df_pred['popularity']
X = df_pred.drop(columns='popularity')

# Randomly split training and testing dataset: shuffle the row positions with
# a fixed seed, then cut the shuffled order at the 80% mark
np.random.seed(42)
n_rows = len(df_pred)
indices = np.random.permutation(n_rows)
train_size = int(0.8*n_rows)
train_indices, test_indices = indices[:train_size], indices[train_size:]

X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
print("Training dataset size: ", X_train.shape)
print("Testing dataset size: ", X_test.shape)

# Standardization
def standardize(col):
    """Compute the statistics needed to standardize `col`.

    Despite its name, this function does not transform the data; it returns
    the mean and standard deviation (both taken along axis 0) that the caller
    then applies as ``(data - mean) / std``.

    Parameters
    ----------
    col : array-like, pandas Series or pandas DataFrame
        Values to summarize. For 2-D input one statistic per column is
        returned.

    Returns
    -------
    tuple
        ``(mean, std)``. Any standard deviation equal to zero is replaced by
        1.0 so that dividing by it leaves a constant column at 0 instead of
        producing NaN/inf.
    """
    mean = np.mean(col, axis=0)
    std = np.std(col, axis=0)

    # 1-D input: the statistic is a single scalar.
    if np.ndim(std) == 0:
        return mean, (std if std != 0 else 1.0)

    # 2-D input: one value per column. Work on a copy so the object returned
    # by np.std is not modified, and neutralize zero-variance columns.
    std = std.copy()
    std[std == 0] = 1.0
    return mean, std

# Statistics come from the training rows only; the target statistics are
# computed as well but are not used by the scaling below.
X_scaler_mean, X_scaler_std = standardize(X_train)
y_scaler_mean, y_scaler_std = standardize(y_train)

# Apply the training statistics to both splits
X_train_scaled, X_test_scaled = (
    (split - X_scaler_mean)/X_scaler_std for split in (X_train, X_test)
)

# modeling
class MultipleLinearRegression():
    """Ordinary least-squares multiple linear regression with an intercept.

    After `fit`, `coefficients` holds the intercept at index 0 followed by
    one weight per feature column.
    """

    @staticmethod
    def _add_intercept(X):
        """Return X as a float ndarray with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.insert(X, 0, 1, axis=1)

    def fit(self, X, y):
        """Estimate the coefficients that minimize the squared error.

        Parameters
        ----------
        X : 2-D array-like or DataFrame, one row per sample
        y : 1-D array-like or Series of targets, one per row of X
        """
        X = self._add_intercept(X)
        y = np.asarray(y, dtype=float)
        # Solve the least-squares problem directly rather than forming and
        # inverting X^T X: explicit inversion raises or loses precision when
        # features are collinear, whereas lstsq returns the minimum-norm
        # solution in that case.
        self.coefficients, *_ = np.linalg.lstsq(X, y, rcond=None)

    def predict(self, X):
        """Return predictions for X as an ndarray; `fit` must be called first."""
        X = self._add_intercept(X)
        return np.dot(X, self.coefficients)
    
# Fit the manual model on the standardized training features, then predict
# the held-out rows
model = MultipleLinearRegression()
model.fit(X=X_train_scaled, y=y_train)
y_pred = model.predict(X=X_test_scaled)

# Evaluate
def mean_squared_error(y, y_pred):
    """Return the sum of squared residuals divided by the number of entries in y.

    NOTE: this definition rebinds the name `mean_squared_error` that the
    notebook imports from sklearn.metrics in an earlier cell.
    """
    residuals = y - y_pred
    squared_total = np.sum(residuals**2)
    return squared_total/len(y)

def r_score(y, y_pred):
    """Return the coefficient of determination (R^2) of `y_pred` against `y`.

    R^2 = 1 - RSS/TSS, where RSS is the residual sum of squares and TSS is
    the total sum of squares of `y` around its mean.

    When `y` is constant TSS is zero and the ratio is undefined; in that case
    the function returns 1.0 for a perfect prediction and 0.0 otherwise,
    instead of nan/-inf (the same finite convention scikit-learn's r2_score
    uses by default).
    """
    rss = np.sum((y - y_pred)**2)
    tss = np.sum((y - np.mean(y))**2)
    if tss == 0:
        # Constant target: avoid dividing by zero.
        return 1.0 if rss == 0 else 0.0
    return 1 - rss/tss

# Score the manual model's predictions on the held-out rows
mse, r2 = mean_squared_error(y_test, y_pred), r_score(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("R square: ", r2)

Training dataset size:  (35, 8)
Testing dataset size:  (9, 8)
Mean Squared Error:  25.672434974227812
R square:  -0.39561559255869305
