In [54]:
# conventional way to import pandas
import pandas as pd
import numpy as np

In [55]:
# read CSV file from the 'data' subdirectory using a relative path
cols=['sepal length','sepal width','petal length','petal width','class']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None,names=cols)
# display the first 5 rows

# One Hot Encoding on Nominal Variable Class

In [56]:
# One Hot Encoding
one_hot_encoded=pd.get_dummies(df)

In [59]:
# create a Python list of feature names
feature_cols = ['sepal width','petal length','petal width','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# print the first 5 rows
X.head()

Unnamed: 0,sepal width,petal length,petal width,class_Iris-setosa,class_Iris-versicolor,class_Iris-virginica
0,3.5,1.4,0.2,1,0,0
1,3.0,1.4,0.2,1,0,0
2,3.2,1.3,0.2,1,0,0
3,3.1,1.5,0.2,1,0,0
4,3.6,1.4,0.2,1,0,0


In [60]:
print(X.shape)

(150, 6)


In [61]:
# select a Series from the DataFrame
Y = one_hot_encoded['sepal length']

# print the first 5 values
Y.head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal length, dtype: float64

# Splitting X and Y into training and testing sets

In [62]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

In [63]:
# default split is 75% for training and 25% for testing
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(112, 6)
(112,)
(38, 6)
(38,)


# Linear regression in scikit-learn


In [64]:
# import model
from sklearn.linear_model import LinearRegression

# instantiate
linreg = LinearRegression()

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [65]:
# print the intercept and coefficients
print(linreg.intercept_)
print(linreg.coef_)

1.5209609366553503
[ 0.44213063  0.91426633 -0.39683311  0.69485303 -0.12231585 -0.57253718]


In [66]:
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

[('sepal width', 0.44213063296299643),
 ('petal length', 0.9142663349628987),
 ('petal width', -0.396833105514812),
 ('class_Iris-setosa', 0.6948530305858115),
 ('class_Iris-versicolor', -0.12231585120522798),
 ('class_Iris-virginica', -0.5725371793805825)]

# Making Predictions on Test Set

In [67]:
# make predictions on the testing set
y_pred = linreg.predict(X_test)

# Computing the RMSE

In [68]:
from sklearn import metrics

print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.38125987757901997


# Try out Regression on other features

Predict Sepal Width

In [69]:
# create a Python list of feature names
feature_cols = ['sepal length','petal length','petal width','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# select a Series from the DataFrame
y = one_hot_encoded['sepal width']

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.2765032068041506


# Predict petal length

In [70]:
# create a Python list of feature names
feature_cols = ['sepal length','sepal width','petal width','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# select a Series from the DataFrame
y = one_hot_encoded['petal length']

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.32673903226161666


# Predict petal width

In [73]:
# create a Python list of feature names
feature_cols = ['sepal length','sepal width','petal length','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# select a Series from the DataFrame
y = one_hot_encoded['petal width']

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.1701794851201285


# Decision Tree Regressor: Fit a new regression model to the training set 

In [74]:
from sklearn.tree import DecisionTreeRegressor

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=324)

regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

# Perform prediction on test

In [75]:
y_prediction = regressor.predict(X_test)

In [76]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_prediction)))

0.07211102550927963


In [79]:
import statsmodels.api as sm

# Note the difference in argument order
model = sm.OLS(Y, X).fit()
predictions = model.predict(X) # make the predictions by the model

# Print out the statistics
model.summary()

0,1,2,3
Dep. Variable:,sepal length,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,4.054e+31
Date:,"Thu, 22 Nov 2018",Prob (F-statistic):,0.0
Time:,19:22:16,Log-Likelihood:,5022.4
No. Observations:,150,AIC:,-10030.0
Df Residuals:,144,BIC:,-10010.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sepal length,1.0000,1.9e-16,5.26e+15,0.000,1.000,1.000
sepal width,5.551e-16,2.03e-16,2.732,0.007,1.53e-16,9.57e-16
petal length,-1.11e-16,2.08e-16,-0.534,0.594,-5.22e-16,3e-16
class_Iris-setosa,-2.22e-15,7.5e-16,-2.961,0.004,-3.7e-15,-7.38e-16
class_Iris-versicolor,-1.443e-15,7.06e-16,-2.046,0.043,-2.84e-15,-4.86e-17
class_Iris-virginica,-1.776e-15,8.23e-16,-2.160,0.032,-3.4e-15,-1.51e-16

0,1,2,3
Omnibus:,143.654,Durbin-Watson:,0.652
Prob(Omnibus):,0.0,Jarque-Bera (JB):,11.365
Skew:,0.064,Prob(JB):,0.00341
Kurtosis:,1.658,Cond. No.,163.0


In [80]:
from sklearn import linear_model

lm = linear_model.LinearRegression()
model = lm.fit(X,Y)

predictions = lm.predict(X)
print('Accuracy of model=',lm.score(X,Y))

Accuracy of model= 1.0


In [81]:

## R2 score Evaluation
y_train_pred=model.predict(X_train)
y_test_pred=model.predict(X_test)

from sklearn.metrics import r2_score

print("r2 train: %.3f, test : %.3f" %(r2_score(y_train,y_train_pred),r2_score(y_test,y_test_pred) ))

r2 train: 1.000, test : 1.000


# Other Regression Models

In [86]:
## Reference https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [87]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X.values)
    rmse= np.sqrt(-cross_val_score(model, X.values, Y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [88]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))


In [89]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))


In [90]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)


In [91]:

GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [92]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

# Model Evaluation Scores¶


In [93]:
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.0008 (0.0002)



In [94]:

score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.0007 (0.0002)



In [95]:
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.0173 (0.0031)



In [96]:
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.1893 (0.0619)



In [97]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1763 (0.0174)

