In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn in this notebook.
# `set_theme` is the current name of this call; `sns.set` is only an alias for it.
sns.set_theme()

# Load Data

In [2]:
# Read the startup dataset; the path is relative to the notebook's working directory.
csv_path = "50_Startups.csv"
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows (five is the default for `head`).
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [5]:
# Distinct values of the categorical "State" column.
data["State"].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [6]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
data1 = data.copy(deep=True)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data1.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [9]:
# First five rows of the working copy.
data1.iloc[:5]

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# Create Regression using StatsModels

Define Variables

In [10]:
# Target and predictors are both taken from the working copy `data1`, so they
# always come from the same frame and stay aligned row-for-row.
y = data1["Profit"]
x = data1[["R&D Spend", "Administration", "Marketing Spend"]]

In [11]:
import statsmodels.api as sm

In [12]:
# Add the "const" column to the predictors so OLS estimates an intercept.
x1 = sm.add_constant(x)

# Build the ordinary-least-squares model, fit it, and show the regression report.
ols_model = sm.OLS(y, x1)
results = ols_model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Profit,R-squared:,0.951
Model:,OLS,Adj. R-squared:,0.948
Method:,Least Squares,F-statistic:,296.0
Date:,"Wed, 26 Jul 2023",Prob (F-statistic):,4.53e-30
Time:,15:49:16,Log-Likelihood:,-525.39
No. Observations:,50,AIC:,1059.0
Df Residuals:,46,BIC:,1066.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.012e+04,6572.353,7.626,0.000,3.69e+04,6.34e+04
R&D Spend,0.8057,0.045,17.846,0.000,0.715,0.897
Administration,-0.0268,0.051,-0.526,0.602,-0.130,0.076
Marketing Spend,0.0272,0.016,1.655,0.105,-0.006,0.060

0,1,2,3
Omnibus:,14.838,Durbin-Watson:,1.282
Prob(Omnibus):,0.001,Jarque-Bera (JB):,21.442
Skew:,-0.949,Prob(JB):,2.21e-05
Kurtosis:,5.586,Cond. No.,1400000.0


In [14]:
# Peek at the first two rows of the design matrix to confirm the "const" column is there.
x1.iloc[:2]

Unnamed: 0,const,R&D Spend,Administration,Marketing Spend
0,1.0,165349.2,136897.8,471784.1
1,1.0,162597.7,151377.59,443898.53


# Making Predictions

In [15]:
# Two hypothetical startups to score with the statsmodels fit.
# "const" is 1 for every row, matching the constant column of the design matrix.
new_data = pd.DataFrame(
    [
        [1, 170000, 100000, 450000],
        [1, 175000, 120000, 430050],
    ],
    columns=["const", "R&D Spend", "Administration", "Marketing Spend"],
)

In [16]:
# Show the hypothetical startups that will be passed to the model.
new_data

Unnamed: 0,const,R&D Spend,Administration,Marketing Spend
0,1,170000,100000,450000
1,1,175000,120000,430050


In [17]:
# Pass exactly the columns of the fitted design matrix `x1`, in the same order.
# Selecting them explicitly keeps this cell re-runnable even after extra
# columns (such as a stored prediction) have been attached to `new_data`.
predictions = results.predict(new_data[x1.columns])

In [18]:
# Predicted profit for each row of `new_data`.
predictions

0    196664.783796
1    199613.839785
dtype: float64

In [23]:
# Attach the predictions to the input frame as a new column.
# NOTE: this mutates `new_data` in place, so it then holds more columns than the model was fitted on.
new_data["Predicted_profit"]=predictions

In [24]:
# Inputs side by side with their predicted profit.
new_data

Unnamed: 0,const,R&D Spend,Administration,Marketing Spend,predictions,Predicted_profit
0,1,170000,100000,450000,196664.783796,196664.783796
1,1,175000,120000,430050,199613.839785,199613.839785


# Model Creation Using ScikitLearn

In [25]:
from sklearn.linear_model import LinearRegression

Define Variables

In [26]:
# Predictors and target for the scikit-learn model, both from the working copy.
feature_cols = ["R&D Spend", "Administration", "Marketing Spend"]
x = data1[feature_cols]
y = data1["Profit"]

In [27]:
# Unfitted linear model; the intercept is estimated by the model itself
# (fit_intercept=True is the default, spelled out here for clarity).
reg = LinearRegression(fit_intercept=True)

In [28]:
# Fit on the three spend columns; `fit` returns the estimator, which is re-bound to `reg`.
reg = reg.fit(X=x, y=y)

In [29]:
# Fitted slope for each predictor, in the column order of `x`.
reg.coef_

array([ 0.80571505, -0.02681597,  0.02722806])

In [30]:
# Fitted intercept (constant term) of the linear model.
reg.intercept_

50122.19298986524

# Calculate R-Squared Value

In [32]:
# R-squared of the fitted model, evaluated on the same data it was trained on.
reg.score(X=x, y=y)

0.9507459940683246

# Adjusted R-Squared value

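$R^2_{adj} = 1 - (1 - R^2)\,\dfrac{n - 1}{n - p - 1}$, where $n$ is the number of observations and $p$ is the number of predictors (not counting the intercept).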

In [34]:
# Ingredients for adjusted R-squared, taken from the fitted scikit-learn model.
R2 = reg.score(x, y)  # plain R-squared on the training data
n = x.shape[0]        # number of observations (rows of x)
p = x.shape[1]        # number of predictors (columns of x; the intercept is not counted)

# Adjusted R-squared penalises R2 for the number of predictors. The intercept
# uses one extra degree of freedom, hence the denominator n - p - 1.
adj_R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1)

print(R2,n,p)

0.9507459940683246 50 3


In [37]:
# Rebuild the fitted values by hand from the estimated intercept and slopes.
# Every term reads from the working copy `data1` (the frame the model was
# fitted on) instead of mixing `data1` and `data` in one expression.
Y = (
    reg.intercept_
    + reg.coef_[0] * data1["R&D Spend"]
    + reg.coef_[1] * data1["Administration"]
    + reg.coef_[2] * data1["Marketing Spend"]
)

In [39]:
# Feature frame for the scikit-learn model: no "const" column, because the
# model carries its own intercept (`reg.intercept_`).
# Only `new_data1` is bound here. A chained `new_data1 = new_data = ...` would
# make both names point at the same object, so later edits to `new_data1`
# would silently change `new_data` and discard the statsmodels input frame.
new_data1 = pd.DataFrame({"R&D Spend": [170000, 175000],
                          "Administration": [100000, 120000],
                          "Marketing Spend": [450000, 430050]})

In [40]:
# Show the feature frame that will be scored with the scikit-learn model.
new_data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,170000,100000,450000
1,175000,120000,430050


In [41]:
# Score only the columns the model was fitted on (`x.columns`), in that order,
# so this cell still works if `new_data1` has gained extra columns.
reg.predict(new_data1[x.columns])

array([196664.78379644, 199613.83978534])

In [42]:
# Store the predicted profit next to the inputs. Only the fitted feature
# columns (`x.columns`) are passed to the model, so re-running this cell does
# not feed the existing "Profit" column back in as a fourth feature.
new_data1["Profit"] = reg.predict(new_data1[x.columns])

In [43]:
# Inputs together with the profit predicted by the scikit-learn model.
new_data1

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
0,170000,100000,450000,196664.783796
1,175000,120000,430050,199613.839785
