In [50]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn import tree

In [51]:
# Read the Hitters dataset from the CSV file under ./csv into a DataFrame.
data_raw = pd.read_csv('./csv/Hitters.csv')
# Display the (rows, columns) of the raw frame.
data_raw.shape

(322, 20)

In [52]:
# Complete-case frame: discard every row that has a missing value in any column.
data_complete = data_raw.dropna(axis=0, how='any')

In [53]:
# Display the (rows, columns) remaining after the rows with missing values were dropped.
data_complete.shape

(263, 20)

In [54]:
# Preview the first rows of the complete-case frame.
data_complete.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [55]:
# Columns carried into the modelling frame; 'Salary' (last) is the one later
# split off as the target.
data_columns = [
    'AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks',
    'Years',
    'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks',
    'PutOuts', 'Assists', 'Errors',
    'Salary',
]

In [56]:
# Keep all rows but only the columns named in data_columns.
data = data_complete.loc[:,data_columns]

In [57]:
# Display the (rows, columns) of the column-restricted frame.
data.shape

(263, 17)

In [58]:
# Feature matrix: every selected column except the target.
X = data.drop(columns='Salary')
# Target vector: natural logarithm of Salary.
y = np.log(data['Salary'])

In [59]:
# Split features and target into train and test parts. The fixed seed makes
# the split repeatable; test_size=0.25 spells out the library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=127,
)

In [60]:
# Fit the scaler on the training features only, then standardise those same
# features with the fitted scaler.
scaler = StandardScaler().fit(X_train)
X_train_scl = scaler.transform(X_train)

In [61]:
# Transform the test features with the already-fitted scaler (transform only,
# the scaler is not refitted here).
X_test_scl = scaler.transform(X_test)

In [62]:
# Linear regression fitted on the standardised training features.
# fit() hands back the estimator itself, so fit_1 is the same fitted object.
model_1 = LinearRegression().fit(X_train_scl, y_train)
fit_1 = model_1

In [63]:
# Predict on the scaled test features and report the mean squared error
# against the test target.
predict_1 = fit_1.predict(X_test_scl)
mse_1 = mean_squared_error(y_true=y_test, y_pred=predict_1)
print('MSE of Linear Regression:', mse_1)

MSE of Linear Regression: 0.41435786712109124


In [64]:
# Lasso regression (alpha=0.5) fitted on the standardised training features.
# fit() hands back the estimator itself, so fit_2 is the same fitted object.
model_2 = Lasso(alpha=0.5).fit(X_train_scl, y_train)
fit_2 = model_2

# Predict on the scaled test features and report the mean squared error.
predict_2 = fit_2.predict(X_test_scl)
mse_2 = mean_squared_error(y_true=y_test, y_pred=predict_2)
print('MSE of Lasso Regression:', mse_2)

MSE of Lasso Regression: 0.7173814077882883


In [66]:
# Decision-tree regressor with a fixed seed, fitted on the same standardised
# training features as the other models.
# fit() hands back the estimator itself, so fit_3 is the same fitted object.
model_3 = tree.DecisionTreeRegressor(random_state=127).fit(X_train_scl, y_train)
fit_3 = model_3

# Predict on the scaled test features and report the mean squared error.
predict_3 = fit_3.predict(X_test_scl)
mse_3 = mean_squared_error(y_true=y_test, y_pred=predict_3)
print('MSE of Decision Tree Regression:', mse_3)

MSE of Decision Tree Regression: 0.4821064720996552


In [67]:
def predict_ensamble(X, model_1, model_2, model_3, *extra_models):
    """Average the predictions of several fitted models (unweighted).

    Parameters
    ----------
    X
        Feature matrix passed unchanged to every model's ``predict``.
    model_1, model_2, model_3
        Objects exposing ``predict(X)``; these three are required, as before.
    *extra_models
        Optional additional objects exposing ``predict(X)``. They take part
        in the average with the same weight as the first three.

    Returns
    -------
    The element-wise mean of all models' predictions: the predictions are
    added in argument order and divided by the number of models.
    """
    members = (model_1, model_2, model_3, *extra_models)

    # Accumulate left to right so the three-model case evaluates exactly
    # (pred_1 + pred_2 + pred_3) / 3, as the original expression did.
    total = None
    for member in members:
        member_pred = member.predict(X)
        total = member_pred if total is None else total + member_pred

    return total / len(members)

In [68]:
# Average the three fitted models' predictions on the scaled test features
# and report the mean squared error of that average.
y_pred_ensamble = predict_ensamble(
    X_test_scl,
    model_1=fit_1,
    model_2=fit_2,
    model_3=fit_3,
)
mse_ensamble = mean_squared_error(y_true=y_test, y_pred=y_pred_ensamble)
print('MSE of Ensemble Model:', mse_ensamble)

MSE of Ensemble Model: 0.37699744379290095
