# Importing the Libraries

In [1]:
import numpy as np
import pandas as pd

# Importing the Dataset

In [3]:
df = pd.read_csv("50_Startups.csv");
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.20,136897.80,471784.10,New York,192261.83
1,162597.70,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
...,...,...,...,...,...
103,119943.24,156547.42,256512.92,Florida,132602.65
104,114523.61,122616.84,261776.23,New York,129917.04
105,78013.11,121597.55,264346.06,California,126992.93
106,94657.16,145077.58,282574.31,New York,125370.37


# Analyse the Data

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        108 non-null    float64
 1   Administration   108 non-null    float64
 2   Marketing Spend  108 non-null    float64
 3   State            108 non-null    object 
 4   Profit           108 non-null    float64
dtypes: float64(4), object(1)
memory usage: 4.3+ KB


# Check whether we have NULL Value

In [6]:
df.isnull().any()

R&D Spend          False
Administration     False
Marketing Spend    False
State              False
Profit             False
dtype: bool

## Note: **Machine can't understand the State Datatype, So we will be using Label Encoders to convert them...**

### First find Unique Value

In [7]:
df["State"].unique()

array(['New York', 'California', 'Florida'], dtype=object)

# Apply Label Encoder

In [8]:
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder()
df["State"] = LE.fit_transform(df["State"])
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


# Dividing Dependent and Independent Values

In [14]:
x = df.iloc[:,0:4].values
y = df.iloc[:,4:5].values

# Checking the Shape of X and Y

In [16]:
x.shape

(108, 4)

In [17]:
y.shape

(108, 1)

# One Hot Encoder
### This returns the data as 1 for the Category it is True for, and 0 for the rest...

In [20]:
from sklearn.preprocessing import OneHotEncoder
One = OneHotEncoder()
z = One.fit_transform(x[:,3:4]).toarray()
z

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0

# Delete State

In [21]:
x = np.delete(x, 3, axis=1)

In [22]:
x.shape

(108, 3)

# Concatenate z into DataFrame

In [23]:
x = np.concatenate((z,x),axis = 1)

In [24]:
x.shape

(108, 6)

# Split Data into Training and Testing Set

In [25]:
from random import Random
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.2, random_state = 0)

In [26]:
x_train.shape

(86, 6)

In [27]:
x_test.shape

(22, 6)

# Model Building

In [28]:
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(x_train, y_train)

In [30]:
YPred = LR.predict(x_test)
YPred

array([[ 48466.75433417],
       [134429.39628447],
       [ 76093.94745793],
       [181201.94746512],
       [113069.45110036],
       [134397.50337433],
       [128842.89457941],
       [159664.83856912],
       [116952.09718497],
       [ 46372.47771261],
       [102406.37641003],
       [115207.87076438],
       [ 48466.75433417],
       [119345.51846304],
       [ 88717.92542862],
       [127301.6808492 ],
       [127301.6808492 ],
       [ 91145.83032982],
       [ 58337.137746  ],
       [146474.04747225],
       [149038.44713347],
       [152718.63782934]])

In [32]:
y_test

array([[ 64926.08],
       [146121.95],
       [ 90708.19],
       [191050.39],
       [108552.04],
       [144259.4 ],
       [124266.9 ],
       [155752.6 ],
       [126992.93],
       [ 42559.73],
       [101004.64],
       [110352.25],
       [ 64926.08],
       [111313.02],
       [ 89949.14],
       [134307.35],
       [134307.35],
       [ 96712.8 ],
       [ 49490.75],
       [129917.04],
       [132602.65],
       [152211.77]])

# Evaluate Model Performance

In [35]:
from sklearn.metrics import r2_score
Acc = r2_score(YPred, y_test)
Acc

0.9316315637551514

# Prediction on Random Value

In [36]:
yp = LR.predict([[1,0,0,1242345,12345,234567]])
yp

array([[1025521.22961573]])