In [44]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [45]:
dataset = pd.read_csv("50_Startups.csv")
print("done")
print(dataset.head(5))

X = dataset.iloc[: , :-1].values
Y = dataset.iloc[:, -1].values

done
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [46]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [-1])], remainder="passthrough")
X = np.array(ct.fit_transform(X)) 
print(X)

[[0.0 0.0 1.0 165349.2 136897.8 471784.1]
 [1.0 0.0 0.0 162597.7 151377.59 443898.53]
 [0.0 1.0 0.0 153441.51 101145.55 407934.54]
 [0.0 0.0 1.0 144372.41 118671.85 383199.62]
 [0.0 1.0 0.0 142107.34 91391.77 366168.42]
 [0.0 0.0 1.0 131876.9 99814.71 362861.36]
 [1.0 0.0 0.0 134615.46 147198.87 127716.82]
 [0.0 1.0 0.0 130298.13 145530.06 323876.68]
 [0.0 0.0 1.0 120542.52 148718.95 311613.29]
 [1.0 0.0 0.0 123334.88 108679.17 304981.62]
 [0.0 1.0 0.0 101913.08 110594.11 229160.95]
 [1.0 0.0 0.0 100671.96 91790.61 249744.55]
 [0.0 1.0 0.0 93863.75 127320.38 249839.44]
 [1.0 0.0 0.0 91992.39 135495.07 252664.93]
 [0.0 1.0 0.0 119943.24 156547.42 256512.92]
 [0.0 0.0 1.0 114523.61 122616.84 261776.23]
 [1.0 0.0 0.0 78013.11 121597.55 264346.06]
 [0.0 0.0 1.0 94657.16 145077.58 282574.31]
 [0.0 1.0 0.0 91749.16 114175.79 294919.57]
 [0.0 0.0 1.0 86419.7 153514.11 0.0]
 [1.0 0.0 0.0 76253.86 113867.3 298664.47]
 [0.0 0.0 1.0 78389.47 153773.43 299737.29]
 [0.0 1.0 0.0 73994.56 122782.75 3

In [47]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y,train_size=0.8, random_state=10)

In [48]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [49]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

class Models:

    def __init__(self) -> None:
        pass

    def multi_linear(self, x_test, x_train, y_train):
        reg = LinearRegression()
        reg.fit(x_train, y_train)
        return reg.predict(x_test)

    def polynomial(self, x_test, x_train, y_train):
        pol = PolynomialFeatures(degree=1)
        pol_out = pol.fit_transform(x_train)
        reg = LinearRegression()
        reg.fit(pol_out, y_train)
        return reg.predict(pol.transform(x_test))
    
    def support_vector_regression(self, x_test, x_train, y_train):
        reg = SVR()
        reg.fit(x_train, y_train)
        return reg.predict(x_test)
    
    def decision_tree(self, x_test, x_train, y_train):
        reg = DecisionTreeRegressor(random_state=10)
        reg.fit(x_train, y_train)
        return reg.predict(x_test)

    def random_forest(self, x_test, x_train, y_train):
        reg = RandomForestRegressor(n_estimators= 20, random_state=10)
        reg.fit(x_train, y_train)
        return reg.predict(x_test)

In [50]:
regression = Models()

y_pred_multi_linear = regression.multi_linear(x_test, x_train, y_train)
y_pred_polynomial = regression.polynomial(x_test, x_train, y_train)
y_pred_svr = regression.support_vector_regression(x_test, x_train, y_train)
y_pred_decision = regression.decision_tree(x_test, x_train, y_train)
y_pred_random_forest = regression.random_forest(x_test, x_train, y_train)

from sklearn.metrics import r2_score


r2_multi_linear = r2_score(y_test, y_pred_multi_linear)
r2_polynomial = r2_score(y_test, y_pred_polynomial)
r2_svr = r2_score(y_test, y_pred_svr)
r2_decision = r2_score(y_test, y_pred_decision)
r2_random_forest = r2_score(y_test, y_pred_random_forest)

print("\nr2_score is:\n\n{} for multi linear regression\n {} for polynomial regression\n {} for svr\n {} for decision tree\n {} for random forest\n\n".format(r2_multi_linear, r2_polynomial, r2_svr, r2_decision, r2_random_forest))


output = np.stack((y_test, y_pred_multi_linear, y_pred_polynomial, y_pred_svr, y_pred_decision, y_pred_random_forest), axis = 1)
output = pd.DataFrame(output)
output.columns = ["y_test", "multi_linear", "polynomial", "svr", "decision_tree", "random_forest"]
print(output)


r2_score is:

0.9901105113397772 for multi linear regression
 0.990110511339777 for polynomial regression
 -0.0005303813915229494 for svr
 0.9120041185090322 for decision tree
 0.9408058832646812 for random forest


      y_test   multi_linear     polynomial            svr  decision_tree  \
0   89949.14   89173.727841   89173.727841  107977.961169       96479.51   
1  108733.99  110171.901449  110171.901449  107982.168525      110352.25   
2   65200.33   65822.477580   65822.477580  107975.425692       78239.91   
3   71498.49   70785.211426   70785.211426  107975.135251       77798.83   
4   42559.73   47652.126940   47652.126940  107975.169537       14681.40   
5  118474.03  116303.596652  116303.596652  107982.921071      110352.25   
6  182901.99  172176.280035  172176.280035  107986.379544      166187.94   
7   99937.59  100657.062541  100657.062541  107976.965800       97483.56   
8  155752.60  160473.252659  160473.252659  107985.814745      156991.12   
9  156122.51  158835.62