## Implementing Multiple Linear Regression

#### Importing the libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Importing the dataset

In [8]:
# Read the startup data from the CSV file in the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='50_Startups.csv')

In [9]:
# Preview the first five rows to confirm the columns loaded as expected.
dataset.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [10]:
# Summarize the frame: column names, non-null counts, dtypes, and memory usage.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


#### Checking for null values

In [13]:
# Count missing entries per column (isna is the canonical name that isnull aliases).
dataset.isna().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

#### Separating the features (X) from the target (y)

In [53]:
# Feature matrix: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column, selected with a list so the result stays two-dimensional.
y = dataset.iloc[:, [-1]].values

#### Since `State` is a categorical column, let's one-hot encode it

In [54]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [55]:
# One-hot encode the column at index 3 and pass every other column through unchanged.
state_encoder = ('encoder', OneHotEncoder(), [3])
ct = ColumnTransformer(
    transformers=[state_encoder],
    remainder='passthrough',
)

In [56]:
# Build the encoded matrix straight from `dataset` rather than from the current
# value of X, so this cell can be re-run safely. With a self-referential
# `X = f(X)`, a second execution would one-hot encode column 3 of the
# already-encoded matrix (a numeric column by then) instead of the State column.
# On a fresh Run All the result is the same as before.
X = np.array(ct.fit_transform(dataset.iloc[:, 0:-1].values))

In [57]:
# Display the encoded feature matrix to check the result of the transformation.
X

array([[0.0, 0.0, 1.0, 165349.2, 136897.8, 471784.1],
       [1.0, 0.0, 0.0, 162597.7, 151377.59, 443898.53],
       [0.0, 1.0, 0.0, 153441.51, 101145.55, 407934.54],
       [0.0, 0.0, 1.0, 144372.41, 118671.85, 383199.62],
       [0.0, 1.0, 0.0, 142107.34, 91391.77, 366168.42],
       [0.0, 0.0, 1.0, 131876.9, 99814.71, 362861.36],
       [1.0, 0.0, 0.0, 134615.46, 147198.87, 127716.82],
       [0.0, 1.0, 0.0, 130298.13, 145530.06, 323876.68],
       [0.0, 0.0, 1.0, 120542.52, 148718.95, 311613.29],
       [1.0, 0.0, 0.0, 123334.88, 108679.17, 304981.62],
       [0.0, 1.0, 0.0, 101913.08, 110594.11, 229160.95],
       [1.0, 0.0, 0.0, 100671.96, 91790.61, 249744.55],
       [0.0, 1.0, 0.0, 93863.75, 127320.38, 249839.44],
       [1.0, 0.0, 0.0, 91992.39, 135495.07, 252664.93],
       [0.0, 1.0, 0.0, 119943.24, 156547.42, 256512.92],
       [0.0, 0.0, 1.0, 114523.61, 122616.84, 261776.23],
       [1.0, 0.0, 0.0, 78013.11, 121597.55, 264346.06],
       [0.0, 0.0, 1.0, 94657.16, 145077.58

#### Splitting data into training and testing data

In [58]:
from sklearn.model_selection import train_test_split

In [76]:
# Hold out 20% of the rows as a test set; random_state=0 makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

#### Now, building the multiple linear regression model

In [77]:
from sklearn.linear_model import LinearRegression

In [78]:
# Create the linear regression estimator with scikit-learn's default settings.
regression = LinearRegression()

In [79]:
# Fit the regressor on the training features and their target values.
regression.fit(X=X_train, y=y_train)

LinearRegression()

Now, let's test the model by predicting on the test data.

In [80]:
# Predict the target for the held-out test features.
predictions = regression.predict(X=X_test)

In [81]:

# Show actual values (left column) next to predicted values (right column).
print("Comparing test and predicted data side-by-side: \n")
side_by_side = np.concatenate((y_test, predictions), axis=1)
side_by_side

Comparing test and predicted data side-by-side: 



array([[103282.38      , 103015.20159796],
       [144259.4       , 132582.27760816],
       [146121.95      , 132447.73845174],
       [ 77798.83      ,  71976.09851258],
       [191050.39      , 178537.48221055],
       [105008.31      , 116161.24230166],
       [ 81229.06      ,  67851.69209676],
       [ 97483.56      ,  98791.73374686],
       [110352.25      , 113969.43533013],
       [166187.94      , 167921.06569551]])

#### Now, let's evaluate the model using the R² score (coefficient of determination)

In [83]:
from sklearn.metrics import r2_score

In [85]:
# R-squared (coefficient of determination) has a maximum of 1.0 and no lower
# bound: a model that fits worse than always predicting the mean scores below
# zero. The message therefore states the best possible value instead of
# implying a fixed -1 to +1 range.
print(f"The R-squared score of the model (best possible value is 1.0) is :{r2_score(y_test, predictions)} ")

The score of the model ranging from -1 to +1 is :0.9347068473282303 
