# Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset

In [12]:
# Read the startups table; every column except the last is a feature and the
# last column is the regression target.
dataset = pd.read_csv('datasets/50_Startups.csv')
feature_columns = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]
X = feature_columns.to_numpy()
y = target_column.to_numpy()

# Encoding categorical data

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 3 and pass every other column
# through unchanged. The encoded dummy columns come first in the output,
# followed by the passthrough columns in their original order.
# sparse_threshold=0 makes the transformer always return a dense array, so the
# np.array(...) wrapper below never receives a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Encode from the untouched `dataset` instead of overwriting X from itself:
# this keeps the cell idempotent, so re-running it cannot one-hot encode an
# already-encoded matrix (where index 3 would be a numeric column).
X = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))


# Splitting the dataset into the Training set and Test set

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Training the Multiple Linear Regression model on the Training set


In [5]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. The bare `fit` call is kept
# as the last expression so the fitted estimator is shown as the cell output.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression()

# Predicting the Test set results

In [6]:
y_pred = regressor.predict(X_test)

# Limit printed floats to 5 digits of precision. This is a global NumPy
# setting, so it also affects arrays printed by later cells.
np.set_printoptions(precision=5)

# Side-by-side view: left column is the prediction, right column is the
# corresponding true value from the test set.
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

[[103015.2016  103282.38   ]
 [132582.27761 144259.4    ]
 [132447.73845 146121.95   ]
 [ 71976.09851  77798.83   ]
 [178537.48221 191050.39   ]
 [116161.2423  105008.31   ]
 [ 67851.6921   81229.06   ]
 [ 98791.73375  97483.56   ]
 [113969.43533 110352.25   ]
 [167921.0657  166187.94   ]]


# Making a single prediction (for example the profit of a startup with R&D Spend = 160000, Administration Spend = 130000, Marketing Spend = 300000 and State = 'California')

In [7]:
# Feature order follows the encoded matrix: the three one-hot State dummies
# first (1, 0, 0 for the California example), then R&D Spend, Administration
# and Marketing Spend. predict() expects a 2D input, hence the nested list.
single_startup = [[1, 0, 0, 160000, 130000, 300000]]
single_profit = regressor.predict(single_startup)
print(single_profit)

[181566.92389]


Therefore, our model predicts that the profit of a Californian startup which spent 160,000 on R&D, 130,000 on Administration and 300,000 on Marketing is $181,566.92.

Important note 1: Notice that the values of the features were all input in a double pair of square brackets. That's because the "predict" method always expects a 2D array as the format of its inputs. And putting our values into a double pair of square brackets makes the input exactly a 2D array. Simply put:

1,0,0,160000,130000,300000→scalars 

[1,0,0,160000,130000,300000]→1D array 

[[1,0,0,160000,130000,300000]]→2D array 



# Getting the final linear regression equation with the values of the coefficients

In [8]:
# Show the fitted weights (one per column of the encoded feature matrix) on
# the first line(s) and the intercept on the line after.
print(regressor.coef_, regressor.intercept_, sep='\n')

[ 8.66384e+01 -8.72646e+02  7.86007e+02  7.73467e-01  3.28846e-02
  3.66100e-02]
42467.529248549545


Therefore, the equation of our multiple linear regression model is:

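$$\text{Profit} = 86.6 \times \text{Dummy State 1} - 873 \times \text{Dummy State 2} + 786 \times \text{Dummy State 3} + 0.773 \times \text{R\&D Spend} + 0.0329 \times \text{Administration} + 0.0366 \times \text{Marketing Spend} + 42467.53$$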

Important Note: To get these coefficients we read the "coef_" and "intercept_" attributes of our regressor object. Attributes in Python are different from methods: they are accessed without parentheses and usually hold a simple value or an array of values.