In [None]:
# Import the required lib

import pandas as pd
import numpy as np

In [None]:
# Load the startups dataset into a DataFrame and preview the first rows.

df = pd.read_csv(filepath_or_buffer='/content/50_Startups.csv')
df.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [None]:
# Inspect each column's dtype and non-null count.

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        108 non-null    float64
 1   Administration   108 non-null    float64
 2   Marketing Spend  108 non-null    float64
 3   State            108 non-null    object 
 4   Profit           108 non-null    float64
dtypes: float64(4), object(1)
memory usage: 4.3+ KB


In [None]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.

df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,108.0,108.0,108.0,108.0
mean,74959.338704,121750.788889,214952.664722,113523.76
std,44996.368152,27322.385654,117937.94212,38991.013654
min,0.0,51283.14,0.0,14681.4
25%,38558.51,105077.645,134050.07,90708.19
50%,75791.365,122699.795,239452.75,109543.12
75%,101913.08,145077.58,298664.47,141585.52
max,165349.2,182645.56,471784.1,192261.83


In [None]:
# Element-wise null mask: True wherever a value is missing.
# NOTE(review): this displays the whole boolean frame; the per-column
# summary in the next cell is easier to read.

df.isnull()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
103,False,False,False,False,False
104,False,False,False,False,False
105,False,False,False,False,False
106,False,False,False,False,False


In [None]:
# Per-column flag: True if the column contains at least one null.
df.isnull().any()

R&D Spend          False
Administration     False
Marketing Spend    False
State              False
Profit             False
dtype: bool

In [None]:
# List the distinct values of 'State' (the only object-dtype column)
# before encoding it as integers.

df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [None]:
# Count how many rows belong to each state.
df['State'].value_counts()

New York      39
California    36
Florida       33
Name: State, dtype: int64

In [None]:
# Label encoding

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Usage pattern: df[col] = le.fit_transform(df[col])
# NOTE(review): `le` is not used in any later visible cell; 'State' is
# encoded manually below instead.

In [None]:
# Manual encoding: map each state name to an integer code.
# `Series.map` plus an explicit `astype` is used instead of `replace`, whose
# implicit object -> int downcast is deprecated in recent pandas releases.
state_codes = {'New York': 0,
               'California': 1,
               'Florida': 2}

# Only encode while the column is still non-numeric, so re-running this cell
# is a no-op rather than mapping already-encoded integers to NaN.
# A state name missing from `state_codes` becomes NaN and makes the
# `astype('int64')` cast raise, instead of passing through silently.
if not pd.api.types.is_numeric_dtype(df['State']):
    df['State'] = df['State'].map(state_codes).astype('int64')

In [None]:
# Preview the frame after 'State' has been encoded.
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [None]:
# Re-check column dtypes now that 'State' has been encoded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        108 non-null    float64
 1   Administration   108 non-null    float64
 2   Marketing Spend  108 non-null    float64
 3   State            108 non-null    int64  
 4   Profit           108 non-null    float64
dtypes: float64(4), int64(1)
memory usage: 4.3 KB


In [None]:
# Separate the target ('Profit') from the remaining feature columns.

y = df['Profit']
x = df.drop(columns=['Profit'])

In [None]:
# Display the feature matrix (every column except 'Profit').
x

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.20,136897.80,471784.10,0
1,162597.70,151377.59,443898.53,1
2,153441.51,101145.55,407934.54,2
3,144372.41,118671.85,383199.62,0
4,142107.34,91391.77,366168.42,2
...,...,...,...,...
103,119943.24,156547.42,256512.92,2
104,114523.61,122616.84,261776.23,0
105,78013.11,121597.55,264346.06,1
106,94657.16,145077.58,282574.31,0


In [None]:
# Display the target series ('Profit').
y

0      192261.83
1      191792.06
2      191050.39
3      182901.99
4      166187.94
         ...    
103    132602.65
104    129917.04
105    126992.93
106    125370.37
107    124266.90
Name: Profit, Length: 108, dtype: float64

In [None]:
# Train and test split

from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=12,
)

In [None]:
# Report the dimensions of each split.
print(f"The shape of xtrain is {xtrain.shape}")
print(f"The shape of xtest is {xtest.shape}")
print(f"The shape of ytrain is {ytrain.shape}")
print(f"The shape of ytest is {ytest.shape}")

The shape of xtrain is (81, 4)
The shape of xtest is (27, 4)
The shape of ytrain is (81,)
The shape of ytest is (27,)


In [None]:
# Model Building

from sklearn.linear_model import LinearRegression

In [None]:
# Create a linear regression estimator with default settings.

lr = LinearRegression()

In [None]:
# Fit the model on the training split.

lr.fit(xtrain,ytrain)

In [None]:
# Predict on the held-out test features and store the predictions in `ypred`.

ypred = lr.predict(xtest)

In [None]:
# Predicted profits for the test split.
ypred

array([ 77670.98400838, 121105.48937758, 113048.1602188 , 154016.16750334,
        84513.58545908, 148544.16791123, 128218.24283949, 121105.48937758,
       109435.6930123 ,  72137.7777111 , 159409.92869125,  77180.89116199,
        98293.32063942, 132431.17296715, 153142.18082396, 153142.18082396,
        61709.69306247, 174222.26857343,  61709.69306247, 154016.16750334,
       128668.22147415, 113048.1602188 , 128668.22147415, 102520.41678047,
       189979.36896937, 146238.11457759,  72049.04668261])

In [None]:
# Actual profits for the test split, for comparison with `ypred`.
ytest

36      90708.19
21     111313.02
19     122776.86
98     149759.96
39      81005.76
14     132602.65
102    134307.35
60     111313.02
26     105733.54
81      71498.49
7      155752.60
40      78239.91
31      97483.56
17     125370.37
97     152211.77
8      152211.77
46      49490.75
92     182901.99
85      49490.75
9      149759.96
51     141585.52
58     122776.86
12     141585.52
68     101004.64
1      191792.06
15     129917.04
38      81229.06
Name: Profit, dtype: float64

In [None]:
# Stack actual and predicted values as two rows (row 0 = actual, row 1 = predicted).
compare = pd.DataFrame([ytest.values,ypred])

In [None]:
# Build the comparison table in its final orientation directly from the
# actual and predicted values (column 0 = actual, column 1 = predicted).
# Deriving it from ytest/ypred keeps this cell idempotent: re-assigning
# `compare.T` to `compare` would flip the orientation on every re-run.
compare = pd.DataFrame([ytest.values, ypred]).T

In [None]:
# Actual (column 0) vs. predicted (column 1) values side by side.
compare

Unnamed: 0,0,1
0,90708.19,77670.984008
1,111313.02,121105.489378
2,122776.86,113048.160219
3,149759.96,154016.167503
4,81005.76,84513.585459
5,132602.65,148544.167911
6,134307.35,128218.242839
7,111313.02,121105.489378
8,105733.54,109435.693012
9,71498.49,72137.777711


# Evaluation Metrics for Regression ML models

In [38]:
from sklearn.metrics import mean_squared_error,r2_score

In [39]:
from sklearn import metrics
# R-squared (coefficient of determination) of the predictions on the test split
print(metrics.r2_score(ytest,ypred))


0.9414933853155694


In [40]:
# Mean squared error on the test split (in squared units of the target)
mean_squared_error(ytest,ypred)

75390264.7408362

In [41]:
# Root mean squared error: square root of the MSE, in the same units as the target
np.sqrt(mean_squared_error(ytest,ypred))

8682.756747763708

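## Checking the model performance after scaling

Ordinary least squares with an intercept is unaffected by standardizing the features, so the metrics below are expected to match the unscaled model.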

In [42]:
# Scaling the data

from sklearn.preprocessing import StandardScaler

In [43]:
# Standardizer instance; it is fitted on the training features in the next cell.
st = StandardScaler()

In [55]:
# Fit the scaler on the training features, then wrap the scaled array back
# into a DataFrame so the original column names are kept.
xtrain_sc = pd.DataFrame(
    data=st.fit_transform(xtrain),
    columns=xtrain.columns,
)
xtrain_sc.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,0.096791,-0.703154,-0.560765,-1.217837
1,0.649145,-0.294253,0.17274,1.2795
2,-1.165084,1.227741,-0.994285,-1.217837
3,-0.114463,-0.469306,0.798783,1.2795
4,-1.614392,-0.063296,-1.350705,0.030831


In [46]:
# Separate estimator for the scaled features, so `lr` stays fitted on the unscaled data.
lr1 = LinearRegression()

In [47]:
# Fit on the standardized training features.
lr1.fit(xtrain_sc,ytrain)

In [48]:
# Scale the test features with the scaler already fitted on the training set
# (transform only -- the scaler is not re-fitted on test data), keeping the
# column names, then predict with the scaled-feature model.
# `xtest_sc` was previously never defined, so this cell failed on a fresh kernel.
xtest_sc = pd.DataFrame(st.transform(xtest), columns=xtest.columns)
ypred1 = lr1.predict(xtest_sc)

In [50]:
# R-squared of the scaled-feature model on the test split.
metrics.r2_score(ytest,ypred1)

0.9414933853155694

In [49]:
# Mean squared error of the scaled-feature model on the test split.
mean_squared_error(ytest,ypred1)

75390264.7408362

In [56]:
# Root mean squared error of the scaled-feature model on the test split.
np.sqrt(mean_squared_error(ytest,ypred1))

8682.756747763708