<a href="https://colab.research.google.com/github/VDai1999/ConcertTicketSale/blob/main/Regression_analyses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Load dataset and prepare data

In [None]:
#Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn import metrics
from sklearn import neighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer,r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import LeaveOneOut
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Load the training/validation rows and the rows to be predicted.
TRAIN_VALID_CSV = "train_valid.csv"
TEST_CSV = "test.csv"

data = pd.read_csv(TRAIN_VALID_CSV)
test = pd.read_csv(TEST_CSV)

In [None]:
# Display (n_rows, n_columns) of the training/validation frame.
data.shape

In [None]:
# Candidate predictors: every column from 'View' through 'cou_score' (a label
# slice, so both ends are included), minus 'type', which does not differ
# among records.
X = data.loc[:, 'View':'cou_score'].drop(columns=['type'])

In [None]:
# Preview the first rows of the candidate predictors.
X.head()

In [None]:
# Target: the 'Millions' column, kept as a one-column DataFrame (not a Series).
y = data.loc[:, ['Millions']]
y.head()

## 2. Feature Selection

Correlation Between Variables

In [None]:
# Pearson correlation between all numeric columns.
# Non-numeric columns (e.g. 'Artist') are excluded explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises on string data.
# Boolean columns are kept, matching what older pandas versions did by default.
plt.figure(figsize=(12,10))
cor = data.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

Selecting highly correlated features

In [None]:
# Absolute Pearson correlation of every column with the target
cor_target = cor["Millions"].abs()
# Keep the columns whose |r| with the target exceeds 0.1
# (the target itself is included, since its self-correlation is 1)
relevant_features = cor_target.loc[cor_target.gt(0.1)]
relevant_features

Forward Selection

In [None]:
# Greedy forward selection: start from no predictors and keep adding the one
# that most improves the 10-fold cross-validated R^2 of a linear regression,
# stopping once 11 predictors have been chosen.
# (SFS and LinearRegression are imported in the notebook's import cell.)
sfs = SFS(LinearRegression(),
          k_features=11,
          forward=True,
          floating=False,
          scoring='r2',
          cv=10)

sfs.fit(X, y)
sfs.k_feature_names_

Backward selection

In [None]:
# Greedy backward elimination: start from all predictors and repeatedly drop
# one until 11 remain, scored by 10-fold cross-validated R^2 of a linear
# regression.
backward_options = dict(
    k_features=11,
    forward=False,
    floating=False,
    scoring='r2',
    cv=10,
)
sbs = SFS(LinearRegression(), **backward_options)
sbs.fit(X, y)
sbs.k_feature_names_

Select the predictor variables retained after feature selection

In [None]:
# Restrict the predictors to the 11 columns kept after feature selection.
selected_features = [
    'View', 'Dislike', 'danceability', 'energy', 'acousticness',
    'instrumentalness', 'liveness', 'duration_ms',
    'neg_score', 'pos_score', 'cou_score',
]
X = X[selected_features]

In [None]:
# Preview the first rows of the selected predictors.
X.head()

In [None]:
# NumPy copies of predictors and target, so rows can be picked with the
# positional index arrays produced by the cross-validation splitter.
X_array, y_array = np.array(X), np.array(y)

In [None]:
loo = LeaveOneOut()

def model_use_LOOCV(md):
    """Print the leave-one-out cross-validated RMSE of an estimator.

    For each row of the notebook-level ``X_array`` / ``y_array``, ``md`` is
    refit on all remaining rows and used to predict the held-out row. The
    root-mean-squared error over all held-out predictions is then printed.

    Parameters
    ----------
    md : estimator
        Object exposing ``fit(X=..., y=...)`` and ``predict(X)``. It is
        fitted in place, once per fold.

    Returns
    -------
    None
        The score is only printed, formatted as ``RMSE: <value>``.
    """
    ytests = []
    ypreds = []
    for train_idx, test_idx in loo.split(X_array):
        X_train, X_test = X_array[train_idx], X_array[test_idx]
        y_train, y_test = y_array[train_idx], y_array[test_idx]

        # The target is flattened because y_array holds one column per row.
        md.fit(X=X_train, y=y_train.ravel())

        ytests.extend(np.ravel(y_test))
        ypreds.extend(np.ravel(md.predict(X_test)))

    # Take the square root explicitly instead of passing `squared=False`,
    # which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(metrics.mean_squared_error(ytests, ypreds))

    print("RMSE: {:.5f}".format(rmse))

## 3. Decision trees using LOOCV

In [None]:
# Three decision-tree variants that share one seed so results are repeatable.
TREE_SEED = 50

# library-default settings
dtree1 = DecisionTreeRegressor(random_state=TREE_SEED)
# Friedman-MSE split criterion
dtree2 = DecisionTreeRegressor(criterion='friedman_mse', random_state=TREE_SEED)
# depth capped at 8 levels
dtree3 = DecisionTreeRegressor(max_depth=8, random_state=TREE_SEED)

## 4. Random Forest Using LOOCV

In [None]:
FOREST_SEED = 50

# Forests of 10, 50 and 100 trees with no explicit depth limit
rforest1, rforest2, rforest3 = (
    RandomForestRegressor(random_state=FOREST_SEED, n_estimators=n_trees)
    for n_trees in (10, 50, 100)
)

# Depth-limited forests: 10 trees with max_depth 5, and 100 trees with max_depth 8
rforest4 = RandomForestRegressor(random_state=FOREST_SEED, n_estimators=10, max_depth=5)
rforest5 = RandomForestRegressor(random_state=FOREST_SEED, n_estimators=100, max_depth=8)

## 5. K-Nearest Neighbor Model

In [None]:
# k-nearest neighbors model
# Each prediction is based on the 45 nearest training rows, so every fit
# needs at least 45 rows.
knn1 = neighbors.KNeighborsRegressor(n_neighbors = 45)

## 4. Lasso Regression Using LOOCV

In [None]:
# The `normalize=True` option was removed from scikit-learn's linear models in
# version 1.2, so scaling is done by a StandardScaler step in a pipeline.
# The old option divided each centred column by its L2 norm (std * sqrt(n)),
# whereas StandardScaler divides by the std only; multiplying alpha by sqrt(n)
# keeps the penalty equivalent to the original setting.
# n is the number of rows seen by each LOOCV fit (one row is held out per fold);
# a final fit on all rows differs only by that single row.
n_fit_rows = len(X) - 1

lasso1 = Lasso()
lasso2 = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.3 * np.sqrt(n_fit_rows), max_iter=30),
)
lasso3 = make_pipeline(
    StandardScaler(with_mean=False),
    Lasso(alpha=0.7 * np.sqrt(n_fit_rows), max_iter=50),
)

## 6. Ridge Regression Using LOOCV

In [None]:
# ridge regression models
# The `normalize=True` option was removed from scikit-learn's linear models in
# version 1.2, so scaling is done by a StandardScaler step in a pipeline.
# The old option divided each centred column by its L2 norm (std * sqrt(n)),
# whereas StandardScaler divides by the std only; for the squared (ridge)
# penalty, multiplying alpha by n keeps it equivalent to the original setting.
# n is the number of rows seen by each LOOCV fit (one row is held out per fold);
# a final fit on all rows differs only by that single row.
n_fit_rows = len(X) - 1

rr1 = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=1.0 * n_fit_rows))
rr2 = make_pipeline(StandardScaler(with_mean=False), Ridge(alpha=0.7 * n_fit_rows))

## 7. Compare Models 

In [None]:
# (label, estimator) pairs, listed in the order they are reported.
candidate_models = [
    ("Decision Tree Model 1: ", dtree1),
    ("Decision Tree Model 2: ", dtree2),
    ("Decision Tree Model 3: ", dtree3),
    ("Random Forest Model 1: ", rforest1),
    ("Random Forest Model 2: ", rforest2),
    ("Random Forest Model 3: ", rforest3),
    ("Random Forest Model 4: ", rforest4),
    ("Random Forest Model 5: ", rforest5),
    ("K-Nearest Neighbor with K = 45: ", knn1),
    ("Lasso Regression Model 1: ", lasso1),
    ("Lasso Regression Model 2: ", lasso2),
    ("Lasso Regression Model 3: ", lasso3),
    ("Ridge Regression Model 1: ", rr1),
    ("Ridge Regression Model 2: ", rr2),
]

# Report the LOOCV RMSE of every candidate, with a rule between consecutive reports.
for position, (label, estimator) in enumerate(candidate_models):
    if position > 0:
        print("="*80)
    print(label)
    model_use_LOOCV(estimator)

- **Ridge Regression Model 1 has the smallest RMSE (114.12892), so we will use it as the model to predict 2020 concert sales for the artists in the test data.**

In [None]:
# Final model: the first ridge regression configuration.
model = rr1

## 8. Use the final model to predict sales in 2020

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Take exactly the predictor columns the model is trained on, in the same
# order, so the test features cannot drift from the training features.
X_pred = test[X.columns.tolist()]

In [None]:
# Refit the chosen model on every training/validation row. The target is
# flattened to 1-D, as in the LOOCV runs, so predict() returns one value per
# test row rather than an (n, 1) array.
model.fit(X, y.values.ravel())
pred = model.predict(X_pred)

# Attach the predictions, rank by predicted sale (largest first) and keep only
# the year, artist and predicted sale.
test = (
    test.assign(Millions=pred)
    .sort_values(by='Millions', ascending=False)
    .reset_index(drop=True)
    .loc[:, ["Year", "Artist", "Millions"]]
)
test

In [None]:
# Save the predictions to a CSV file, omitting the DataFrame index column.
test.to_csv("test_results.csv", index=False)

In [None]:
# Stack the training/validation rows on top of the test rows, keep the three
# reporting columns, and drop exact duplicate rows.
df = (
    pd.concat([data, test], ignore_index=True)
    .loc[:, ["Year", "Artist", "Millions"]]
    .drop_duplicates()
)

In [None]:
# Example plot: Ed Sheeran's 'Millions' value per year.
# NOTE(review): the original note says the 2020 views were counted for only
# half of the year — confirm how that affects the 2020 bar.
df = df[df["Artist"]=="Ed Sheeran"]

fig, ax = plt.subplots()
ax.bar(df['Year'], df['Millions'])
ax.set(title="Ed Sheeran: Millions by year", xlabel="Year", ylabel="Millions")
plt.show()