# Multiple Linear Regression

In [1]:
import pandas as pd

## Import dataset

In [2]:
# Read the startups CSV into a DataFrame; the trailing bare name renders it
# as the cell output.
dataset = pd.read_csv("Startups_dataset.csv")
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.3,136897.9,471784.2,New York,192261.93
1,162597.8,151377.69,443898.63,California,191792.16
2,153441.61,101145.65,407934.64,Florida,191050.49
3,144372.51,118671.95,383199.72,New York,182902.09
4,142107.44,91391.87,366168.52,Florida,166188.04
5,131877.0,99814.81,362861.46,New York,156991.22
6,134615.56,147198.97,127716.92,California,156122.61
7,130298.23,145530.16,323876.78,Florida,155752.7
8,120542.62,148719.05,311613.39,New York,152211.87
9,123334.98,108679.27,304981.72,California,149760.06


## Encode State Column (one-hot)

In [3]:
# One-hot encode the categorical column(s). drop_first=True omits the first
# category, so only State_Florida and State_New York are added and a row with
# both flags False stands for California.
dataset = pd.get_dummies(data=dataset, drop_first=True)
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.3,136897.9,471784.2,192261.93,False,True
1,162597.8,151377.69,443898.63,191792.16,False,False
2,153441.61,101145.65,407934.64,191050.49,True,False
3,144372.51,118671.95,383199.72,182902.09,False,True
4,142107.44,91391.87,366168.52,166188.04,True,False
5,131877.0,99814.81,362861.46,156991.22,False,True
6,134615.56,147198.97,127716.92,156122.61,False,False
7,130298.23,145530.16,323876.78,155752.7,True,False
8,120542.62,148719.05,311613.39,152211.87,False,True
9,123334.98,108679.27,304981.72,149760.06,False,False


## Input / Output Split

In [4]:
# List the column names after encoding so the feature and target columns
# can be selected by name in the next cells.
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [8]:
# Feature matrix (model inputs): the three spend columns plus the two
# one-hot State flags. Profit is deliberately left out; it is the target.
independent = dataset.loc[
    :,
    [
        "R&D Spend",
        "Administration",
        "Marketing Spend",
        "State_Florida",
        "State_New York",
    ],
]
independent

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.3,136897.9,471784.2,False,True
1,162597.8,151377.69,443898.63,False,False
2,153441.61,101145.65,407934.64,True,False
3,144372.51,118671.95,383199.72,False,True
4,142107.44,91391.87,366168.52,True,False
5,131877.0,99814.81,362861.46,False,True
6,134615.56,147198.97,127716.92,False,False
7,130298.23,145530.16,323876.78,True,False
8,120542.62,148719.05,311613.39,False,True
9,123334.98,108679.27,304981.72,False,False


In [10]:
# Target (model output): Profit, kept as a one-column DataFrame rather than
# a Series.
dependent = dataset.loc[:, ["Profit"]]
dependent

Unnamed: 0,Profit
0,192261.93
1,191792.16
2,191050.49
3,182902.09
4,166188.04
5,156991.22
6,156122.61
7,155752.7
8,152211.87
9,149760.06


## Train / Test Split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 30% of the rows for testing; random_state=0 pins the shuffle so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [13]:
# Inspect the training features; the shuffled row labels come from the
# original dataset index.
x_train

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
7,130298.23,145530.16,323876.78,True,False
14,119943.34,156547.52,256513.02,True,False
45,1000.33,124153.14,1904.03,False,True
48,542.15,51743.25,0.1,False,True
29,65605.58,153032.16,107138.48,False,True
15,114523.71,122616.94,261776.33,False,True
30,61994.58,115641.38,91131.34,True,False
32,63408.96,129219.71,46085.35,False,False
16,78013.21,121597.65,264346.16,False,False
42,23641.03,96189.73,148001.21,False,False


In [14]:
# Inspect the training targets; row labels line up with x_train.
y_train

Unnamed: 0,Profit
7,155752.7
14,132602.75
45,64926.18
48,35673.51
29,101004.74
15,129917.14
30,99937.69
32,97427.94
16,126993.03
42,71498.59


## Model Creation/Train

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Create an ordinary least-squares regressor and fit it on the training split.
# The fit() call is left as the last expression so the fitted estimator is
# rendered as the cell output.
multiRegressor = LinearRegression()
multiRegressor.fit(X=x_train, y=y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Show w (weights) and b (bias) | $y = w_1x_1 + w_2x_2 + \dots + w_nx_n + b$

In [17]:
# Learned coefficients w, one per feature column of x_train.
# The array is 2-D because the target was passed as a one-column DataFrame.
weight = multiRegressor.coef_
weight

array([[7.90840255e-01, 3.01968165e-02, 3.10148566e-02, 4.63028992e+02,
        3.04799573e+02]])

In [18]:
# Learned intercept b (the bias term of the regression equation).
# NOTE: the variable keeps its original spelling "bais" so any other cell
# that refers to it keeps working.
bais = multiRegressor.intercept_
bais

array([42403.88566534])

## Test model

In [19]:
# Predict Profit for the held-out test features; one prediction per row of
# x_test.
y_pred = multiRegressor.predict(X=x_test)
y_pred

array([[104282.86472172],
       [132536.98499212],
       [133910.95007766],
       [ 72584.87489417],
       [179921.0276189 ],
       [114549.41079234],
       [ 66444.53261346],
       [ 98405.06840122],
       [114499.92808602],
       [169367.60639895],
       [ 96522.7253998 ],
       [ 88040.7718287 ],
       [110950.09405525],
       [ 90419.2897851 ],
       [128020.56250064]])

## Evaluation Metrics / Cross check

In [20]:
from sklearn.metrics import r2_score


In [21]:
# Coefficient of determination (R^2) of the predictions against the true
# test targets; 1.0 would be a perfect fit.
r_score = r2_score(y_true=y_test, y_pred=y_pred)

In [22]:
# Display the R^2 score computed in the previous cell.
r_score

0.9358680970046245

## Save the Model

In [23]:
import pickle

In [24]:
# Path of the file the trained model is pickled to.
filename = "finalized_model_multi_linear.sav"

In [25]:
# Serialize the fitted regressor to disk. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; the original
# bare open() left the handle to be closed whenever garbage collection ran.
with open(filename, "wb") as model_file:
    pickle.dump(multiRegressor, model_file)

## Load the Model

In [26]:
# Restore the regressor from disk, closing the file handle deterministically.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load model files that were produced by a trusted source.
with open('finalized_model_multi_linear.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [28]:
# Predict the profit of one new startup.
# The model was fitted on a DataFrame, so the sample is passed as a DataFrame
# with the same column names. A bare nested list relies on silent positional
# order and makes scikit-learn warn that X has no valid feature names.
# State encoding: Florida = (1, 0) | New York = (0, 1) | California = (0, 0)
new_startup = pd.DataFrame(
    [
        {
            "R&D Spend": 1300,
            "Administration": 6700,
            "Marketing Spend": 8000,
            "State_Florida": 0,
            "State_New York": 1,
        }
    ]
)
# Reorder the columns to exactly the order recorded at fit time.
result = loaded_model.predict(new_startup[list(loaded_model.feature_names_in_)])



In [30]:
result  # Predicted Profit for the sample above, returned as a 2-D array

array([[44187.21509393]])