In [1]:
# Importing necessary libraries
import pandas as pd

In [2]:
# Loading the dataset
Dataset = pd.read_csv("50_Startups.csv")
Dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# One hot encoding
dataset = pd.get_dummies(Dataset,drop_first=True).astype(int)
dataset.head()    # Displays only the first 5 rows in the dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349,136897,471784,192261,0,1
1,162597,151377,443898,191792,0,0
2,153441,101145,407934,191050,1,0
3,144372,118671,383199,182901,0,1
4,142107,91391,366168,166187,1,0


In [4]:
# Displaying the column names in the dataset
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [5]:
# Feature selection or Input for MLR
independent = dataset[['R&D Spend', 'Administration', 'Marketing Spend','State_Florida', 'State_New York']]
independent.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349,136897,471784,0,1
1,162597,151377,443898,0,0
2,153441,101145,407934,1,0
3,144372,118671,383199,0,1
4,142107,91391,366168,1,0


In [6]:
# Target identification or output selection
dependent = dataset[['Profit']]
dependent.head()

Unnamed: 0,Profit
0,192261
1,191792
2,191050
3,182901
4,166187


In [7]:
# Splitting the dataset into training and testing sets
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(independent,dependent,test_size=0.25,random_state=0)

In [8]:
X_train.shape

(37, 5)

In [9]:
X_test.shape

(13, 5)

In [10]:
Y_train.shape

(37, 1)

In [11]:
Y_test.shape

(13, 1)

In [12]:
# Model creation and training
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,Y_train)

In [13]:
# displaying the weights
weights = regressor.coef_
weights

array([[ 7.80990089e-01,  4.57969427e-02,  3.35424266e-02,
        -5.21996350e+02,  1.11281396e+01]])

In [14]:
# Displaying the bias
bias = regressor.intercept_
bias

array([41049.19096689])

In [15]:
# Model prediction on test data
Y_pred = regressor.predict(X_test)
Y_pred

array([[104439.92423106],
       [132252.96540897],
       [132871.68715232],
       [ 71706.75210861],
       [178678.32293309],
       [115077.56661322],
       [ 66093.37626795],
       [ 98759.21395052],
       [114112.87079973],
       [167978.94489241],
       [ 95785.71835983],
       [ 87785.22855086],
       [110454.92069199]])

In [16]:
# Model Evaluation
from sklearn.metrics import r2_score
score = r2_score(Y_test,Y_pred)
score

0.9315812627456479

In [17]:
# Saving the model
import pickle
filename = "Profit_Prediction_Saved_Model.sav"
pickle.dump(regressor,open(filename,'wb'))

In [18]:
# checking the model before deploynment
Loaded_model = pickle.load(open("Profit_Prediction_Saved_Model.sav",'rb'))
result = Loaded_model.predict([[13456,98759.86,19999.9,0,0]])
result



array([[56751.93844061]])