In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import BaggingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Read the Hitters table from disk, then keep only the rows that have no
# missing value in any column.
data_raw = pd.read_csv('./csv/Hitters.csv')
data_complete = data_raw.dropna(axis=0, how='any')

# Show (rows, columns) before and after the filter, one shape per line.
print(data_raw.shape, data_complete.shape, sep='\n')

(322, 20)
(263, 20)


In [3]:
# Columns kept for modelling. 'Salary' is listed last; it is split off as the
# response in a later cell.
data_columns = [
    'AtBat', 'Hits', 'HmRun', 'Runs', 'RBI', 'Walks',
    'Years',
    'CAtBat', 'CHits', 'CHmRun', 'CRuns', 'CRBI', 'CWalks',
    'PutOuts', 'Assists', 'Errors',
    'Salary',
]

# Restrict the complete-case frame to exactly those columns, in that order.
data = data_complete[data_columns]
print(data.shape)

# Preview the first rows (left as the last expression so the notebook renders it).
data.head(5)

(263, 17)


Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0


In [4]:
# Predictors: every selected column except the response.
X = data.drop(columns='Salary')
print(X.shape)

# Response: natural logarithm of Salary.
y = np.log(data['Salary'])
print(y.shape)

# Hold out a test set. No test_size/train_size is passed, so the library's
# default proportions apply; the seed fixes which rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=127,
)

(263, 16)
(263,)


In [5]:
# Learn the scaling parameters from the training predictors only, then apply
# that same fitted scaler to both the training and the test predictors.
scaler = StandardScaler().fit(X_train)
X_train_scl = scaler.transform(X_train)
X_test_scl = scaler.transform(X_test)

In [6]:
# LinearRegression with default settings. It is fitted on its own in the next
# cell and is also the object passed as `estimator=` to BaggingRegressor.
base_regressor = LinearRegression()

In [7]:
# Baseline: fit the single linear model on the scaled training data.
# fit_base is whatever fit() returns; it is used below to predict.
fit_base = base_regressor.fit(X=X_train_scl, y=y_train)

# Score the baseline on the held-out rows (targets are log(Salary)).
predict_base = fit_base.predict(X_test_scl)
performance_base = mean_squared_error(y_true=y_test, y_pred=predict_base)
print('Base model performance: ', performance_base)

Base model performance:  0.41435786712109124


In [9]:
# Bagging ensemble built on the linear base model. Each of the 1000 members is
# given a bootstrap sample (max_samples=0.3) and 2 of the features, the latter
# drawn without replacement.
bag_linreg = BaggingRegressor(
    estimator=base_regressor,
    n_estimators=1000,
    max_samples=0.3,
    bootstrap=True,
    max_features=2,
    bootstrap_features=False,
    n_jobs=-1,
    random_state=127,
)

In [10]:
# Train the bagging ensemble on the scaled training data.
bag_linreg.fit(X=X_train_scl, y=y_train)

In [11]:
# Test-set mean squared error of the bagging ensemble (targets are log(Salary)).
prediction = bag_linreg.predict(X=X_test_scl)
performance_bag = mean_squared_error(y_true=y_test, y_pred=prediction)
print('Bagging model performance: ', performance_bag)

Bagging model performance:  0.3637498934788416


In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forest: 1000 trees of depth at most 4, each grown on a bootstrap
# sample (max_samples=0.3) and split using the squared-error criterion.
random_forest = RandomForestRegressor(
    n_estimators=1000,
    criterion='squared_error',
    max_depth=4,
    max_samples=0.3,
    bootstrap=True,
    n_jobs=-1,
    random_state=127,
)

In [13]:
# Train the random forest on the scaled training data.
random_forest.fit(X=X_train_scl, y=y_train)

In [14]:
# Test-set mean squared error of the random forest (targets are log(Salary)).
prediction = random_forest.predict(X=X_test_scl)
performance_rf = mean_squared_error(y_true=y_test, y_pred=prediction)
print('Random Forest model performance: ', performance_rf)

Random Forest model performance:  0.2054851127465172


In [15]:
from sklearn.ensemble import ExtraTreesRegressor

In [16]:
# Extra-trees ensemble: 1000 trees of depth at most 5, each grown on a
# bootstrap sample (max_samples=0.3) with the squared-error criterion.
# NOTE(review): the seed here is 171 while the other models use 127 -
# confirm this is intentional and not a transposition.
extra_trees = ExtraTreesRegressor(
    n_estimators=1000,
    criterion='squared_error',
    max_depth=5,
    max_samples=0.3,
    bootstrap=True,
    n_jobs=-1,
    random_state=171,
)

In [17]:
# Train the extra-trees ensemble on the scaled training data.
extra_trees.fit(X=X_train_scl, y=y_train)

In [18]:
# Test-set mean squared error of the extra-trees ensemble (targets are log(Salary)).
prediction = extra_trees.predict(X=X_test_scl)
performance_et = mean_squared_error(y_true=y_test, y_pred=prediction)
print('Extra Trees model performance: ', performance_et)

Extra Trees model performance:  0.20699218407167783


### AdaBoost

In [None]:
# Rebinds base_regressor to a new LinearRegression with default settings; it is
# the object passed as `estimator=` to AdaBoostRegressor further down.
# NOTE(review): this cell carries no execution count in the saved notebook, so
# the recorded AdaBoost output was presumably produced with the earlier
# base_regressor object - re-run top to bottom to confirm.
base_regressor = LinearRegression()


In [19]:
from sklearn.ensemble import AdaBoostRegressor

In [21]:
# AdaBoost ensemble built on the linear base model: 200 boosting rounds,
# learning rate 0.8, 'linear' loss.
boost_linreg = AdaBoostRegressor(
    estimator=base_regressor,
    n_estimators=200,
    learning_rate=0.8,
    loss='linear',
    random_state=127,
)

In [22]:
# Train the AdaBoost ensemble on the scaled training data.
boost_linreg.fit(X=X_train_scl, y=y_train)

In [23]:
# Test-set mean squared error of the AdaBoost ensemble (targets are log(Salary)).
prediction = boost_linreg.predict(X=X_test_scl)
performance_boost = mean_squared_error(y_true=y_test, y_pred=prediction)
print('Boosting model performance: ', performance_boost)

Boosting model performance:  0.4388774115448971


In [24]:
from sklearn.ensemble import GradientBoostingRegressor

In [25]:
# Gradient boosting: 200 stages of depth-4 trees, learning rate 0.5,
# squared-error loss and split criterion, subsample left at 1.
gb_reg = GradientBoostingRegressor(
    loss='squared_error',
    n_estimators=200,
    learning_rate=0.5,
    subsample=1,
    criterion='squared_error',
    max_depth=4,
    random_state=127,
)

In [26]:
# Train the gradient-boosting model on the scaled training data.
gb_reg.fit(X=X_train_scl, y=y_train)

In [27]:
# Test-set mean squared error of the gradient-boosting model (targets are log(Salary)).
prediction = gb_reg.predict(X=X_test_scl)
performance_gb = mean_squared_error(y_true=y_test, y_pred=prediction)
print('Gradient Boosting model performance: ', performance_gb)

Gradient Boosting model performance:  0.26248685500786834
