In [54]:
# conventional way to import pandas
import pandas as pd
import numpy as np

In [55]:
# read CSV file from the 'data' subdirectory using a relative path
cols=['sepal length','sepal width','petal length','petal width','class']
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None,names=cols)
# display the first 5 rows

# One Hot Encoding on Nominal Variable Class

In [56]:
# One Hot Encoding
one_hot_encoded=pd.get_dummies(df)

In [59]:
# create a Python list of feature names
feature_cols = ['sepal width','petal length','petal width','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# print the first 5 rows
X.head()

Unnamed: 0,sepal width,petal length,petal width,class_Iris-setosa,class_Iris-versicolor,class_Iris-virginica
0,3.5,1.4,0.2,1,0,0
1,3.0,1.4,0.2,1,0,0
2,3.2,1.3,0.2,1,0,0
3,3.1,1.5,0.2,1,0,0
4,3.6,1.4,0.2,1,0,0


In [60]:
print(X.shape)

(150, 6)


In [61]:
# select a Series from the DataFrame
Y = one_hot_encoded['sepal length']

# print the first 5 values
Y.head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal length, dtype: float64

# Splitting X and Y into training and testing sets

In [62]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=1)

In [63]:
# default split is 75% for training and 25% for testing
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(112, 6)
(112,)
(38, 6)
(38,)


# Linear regression in scikit-learn


In [64]:
# import model
from sklearn.linear_model import LinearRegression

# instantiate
linreg = LinearRegression()

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [65]:
# print the intercept and coefficients
print(linreg.intercept_)
print(linreg.coef_)

1.5209609366553503
[ 0.44213063  0.91426633 -0.39683311  0.69485303 -0.12231585 -0.57253718]


In [66]:
# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

[('sepal width', 0.44213063296299643),
 ('petal length', 0.9142663349628987),
 ('petal width', -0.396833105514812),
 ('class_Iris-setosa', 0.6948530305858115),
 ('class_Iris-versicolor', -0.12231585120522798),
 ('class_Iris-virginica', -0.5725371793805825)]

# Making Predictions on Test Set

In [67]:
# make predictions on the testing set
y_pred = linreg.predict(X_test)

# Computing the RMSE

In [68]:
from sklearn import metrics

print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.38125987757901997


# Try out Regression on other features

Predict Sepal Width

In [69]:
# create a Python list of feature names
feature_cols = ['sepal length','petal length','petal width','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# select a Series from the DataFrame
y = one_hot_encoded['sepal width']

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.2765032068041506


# Predict petal length

In [70]:
# create a Python list of feature names
feature_cols = ['sepal length','sepal width','petal width','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# select a Series from the DataFrame
y = one_hot_encoded['petal length']

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.32673903226161666


# Predict petal width

In [73]:
# create a Python list of feature names
feature_cols = ['sepal length','sepal width','petal length','class_Iris-setosa', 'class_Iris-versicolor', 'class_Iris-virginica']

# use the list to select a subset of the original DataFrame
X = one_hot_encoded[feature_cols]

# select a Series from the DataFrame
y = one_hot_encoded['petal width']

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# fit the model to the training data (learn the coefficients)
linreg.fit(X_train, y_train)

# make predictions on the testing set
y_pred = linreg.predict(X_test)

# compute the RMSE of our predictions
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

0.1701794851201285


# Decision Tree Regressor: Fit a new regression model to the training set 

In [74]:
from sklearn.tree import DecisionTreeRegressor

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=324)

regressor = DecisionTreeRegressor(max_depth=20)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

# Perform prediction on test

In [75]:
y_prediction = regressor.predict(X_test)

In [76]:
print(np.sqrt(metrics.mean_squared_error(y_test, y_prediction)))

0.07211102550927963
