In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the data set: every column except the last is a feature,
# and the last column is the regression target.
dataset = pd.read_csv('Data_RM.csv')
feature_columns, target_column = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X = feature_columns.values
y = target_column.values

# Decision Tree Regression Model

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.2, random_state=0)

In [4]:
from sklearn.tree import DecisionTreeRegressor

# Seeded decision tree trained on the unscaled training split.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X=X_train, y=y_train)

In [5]:
# Predict on the test split and print predictions (left column)
# next to the true targets (right column), rounded to two decimals.
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

[[431.28 431.23]
 [459.59 460.01]
 [460.06 461.14]
 ...
 [471.46 473.26]
 [437.76 438.  ]
 [462.74 463.28]]


In [6]:
from sklearn.metrics import r2_score

# R^2 of the decision tree predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.922905874177941

# Multiple Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on all feature columns of the training split.
m_regressor = LinearRegression()
m_regressor.fit(X=X_train, y=y_train)

In [8]:
# Predict with the multiple linear regression model and print predictions
# (left column) next to the true targets (right column).
y_pred_m = m_regressor.predict(X_test)
np.set_printoptions(precision=2)
# Reshape with y_pred_m's own length: the previous code used len(y_pred), the
# decision-tree predictions from another cell, so this cell only worked when
# that cell had already been run.
print(np.concatenate((y_pred_m.reshape(len(y_pred_m), 1), y_test.reshape(len(y_test), 1)), 1))

[[431.43 431.23]
 [458.56 460.01]
 [462.75 461.14]
 ...
 [469.52 473.26]
 [442.42 438.  ]
 [461.88 463.28]]


In [9]:
from sklearn.metrics import r2_score

# R^2 of the multiple linear regression predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_m)

0.9325315554761303

# Polynomial Regression

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into polynomial terms up to degree 4,
# then fit an ordinary linear model on the expanded matrix.
poly_reg = PolynomialFeatures(degree=4)
p_regressor = LinearRegression()
X_poly = poly_reg.fit_transform(X_train)
p_regressor.fit(X=X_poly, y=y_train)

In [11]:
# Apply the already-fitted polynomial expansion to the test features before
# predicting, then print predictions (left) next to true targets (right).
X_test_poly = poly_reg.transform(X_test)
y_pred_p = p_regressor.predict(X_test_poly)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred_p, y_test)))

[[433.94 431.23]
 [457.9  460.01]
 [460.52 461.14]
 ...
 [469.53 473.26]
 [438.27 438.  ]
 [461.66 463.28]]


In [12]:
from sklearn.metrics import r2_score

# R^2 of the polynomial regression predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_p)

0.9458193585520964

# Random Forest Regression

In [13]:
from sklearn.ensemble import RandomForestRegressor

# Seeded forest of 10 trees trained on the unscaled training split.
rf_regressor = RandomForestRegressor(n_estimators=10, random_state=0)
rf_regressor.fit(X=X_train, y=y_train)

In [14]:
# Predict with the random forest and print predictions (left column)
# next to the true targets (right column).
y_pred_rf = rf_regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred_rf, y_test)))

[[434.05 431.23]
 [458.79 460.01]
 [463.02 461.14]
 ...
 [469.48 473.26]
 [439.57 438.  ]
 [460.38 463.28]]


In [15]:
from sklearn.metrics import r2_score

# R^2 of the random forest predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_rf)

0.9615908334363876

# Support Vector Regression (SVR)

In [21]:
# Reload the data set for the SVR section. X is rebound to the feature columns;
# the target goes into y1 so it can be reshaped separately.
dataset = pd.read_csv('Data_RM.csv')
feature_columns, target_column = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X = feature_columns.values
y1 = target_column.values

In [22]:
# Turn the target into a single-column 2-D array, the layout StandardScaler
# is given in the scaling cell.
y1 = y1.reshape(-1, 1)

In [23]:
from sklearn.model_selection import train_test_split

# Same 80/20 seeded split as before, but with the column-shaped target y1.
# This rebinds X_train, X_test, y_train and y_test for the SVR section.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y1, test_size=0.2, random_state=0)

In [24]:
# Show the training features as they are before the scaling cell rescales them.
print(X_train)

[[  11.22   43.13 1017.24   80.9 ]
 [  13.67   54.3  1015.92   75.42]
 [  32.84   77.95 1014.68   45.8 ]
 ...
 [  16.81   38.52 1018.26   75.21]
 [  12.8    41.16 1022.43   86.19]
 [  32.32   67.9  1006.08   37.93]]


In [25]:
from sklearn.preprocessing import StandardScaler

# Separate scalers for features and target so that predictions can later be
# mapped back to the original target units with sc_y.inverse_transform.
# NOTE: X_train and y_train are overwritten with their scaled versions, so
# re-running this cell would re-fit both scalers on already-scaled data;
# re-run the split cell first.
sc_X, sc_y = StandardScaler(), StandardScaler()
X_train = sc_X.fit_transform(X_train)
y_train = sc_y.fit_transform(y_train)

In [26]:
from sklearn.svm import SVR

# RBF-kernel support vector regressor trained on the standardized training data.
s_regressor = SVR(kernel = 'rbf')
# y_train is a single-column 2-D array (the shape used for StandardScaler).
# SVR.fit expects a 1-D target, so flatten it here; this avoids the
# column_or_1d DataConversionWarning while fitting on the same values.
s_regressor.fit(X_train, y_train.ravel())

  y = column_or_1d(y, warn=True)


In [27]:
# Scale the test features with the scaler fitted on the training data, predict
# in scaled target space, then map the predictions back to original units.
X_test_scaled = sc_X.transform(X_test)
y_pred_scaled = s_regressor.predict(X_test_scaled).reshape(-1, 1)
y_pred_s = sc_y.inverse_transform(y_pred_scaled)
np.set_printoptions(precision=2)
# Predictions in the left column, true targets in the right column.
print(np.column_stack((y_pred_s, y_test)))

[[434.05 431.23]
 [457.94 460.01]
 [461.03 461.14]
 ...
 [470.6  473.26]
 [439.42 438.  ]
 [460.92 463.28]]


In [28]:
from sklearn.metrics import r2_score

# R^2 of the SVR predictions (back in original units) on the test split.
r2_score(y_true=y_test, y_pred=y_pred_s)

0.948078404998626

# Regression Model Pros and Cons

| Regression Model | Pros | Cons |
| --- | --- | --- |
| Linear Regression | Works on any size of dataset, gives information about the relevance of features | Linear Regression Assumptions |
| Polynomial Regression | Works on any size of dataset, works very well on non linear problems | Need to choose the right polynomial degree for a good bias/variance tradeoff |
| SVR | Easily adaptable, works very well on non linear problems, not biased by outliers | Compulsory to apply feature scaling, not well known, more difficult to understand |
| Decision Tree Regression | Interpretability, no need for feature scaling, works on both linear problems and non linear problems | Poor results on too small datasets, overfitting can easily occur |
| Random Forest Regression | Powerful and accurate, good performance on many problems, including non linear problems | No interpretability, overfitting can easily occur, need to choose the number of trees |