## Steps to build the model

### 1) Import all necessary libraries

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, mean_squared_error

### 2) Load the DataFrame

In [7]:
startup = pd.read_csv('50_Startups.csv')

In [9]:
startup.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [10]:
startup.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [12]:
startup.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
R&D Spend,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
Administration,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
Marketing Spend,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
Profit,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


In [13]:
startup.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [18]:
startup.duplicated().sum()

0

In [16]:
corr = startup.corr()
corr

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


### 3) Define the features (x) and label (y)

In [21]:
X = startup.iloc[:, :-2].values
y = startup.iloc[:, -1].values

### 4) Split our dataset into training and testing dataset

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### 5) Build the model

In [27]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [28]:
y_prediction = regressor.predict(X_test)

In [29]:
y_prediction

array([103901.8969696 , 132763.05993126, 133567.90370044,  72911.78976736,
       179627.92567224, 115166.64864795,  67113.5769057 ,  98154.80686776,
       114756.11555221, 169064.01408795])

In [42]:
pd.DataFrame({'Actual Profit': y_test, 'Predicted Profit': y_prediction})

Unnamed: 0,Actual Profit,Predicted Profit
0,103282.38,103901.89697
1,144259.4,132763.059931
2,146121.95,133567.9037
3,77798.83,72911.789767
4,191050.39,179627.925672
5,105008.31,115166.648648
6,81229.06,67113.576906
7,97483.56,98154.806868
8,110352.25,114756.115552
9,166187.94,169064.014088


### 6) Find the accuracy score using r_squared 

In [32]:
score = r2_score(y_prediction, y_test)

In [40]:
round(score * 100, 2)

93.59

### 7) Predict

In [41]:
regressor.predict([[250000, 550000, 170000]])

array([259765.55701708])