In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the startups dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./50_Startups.csv')

In [3]:
df.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


In [4]:
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [5]:
## creating dummy variables for the categorical State column

In [6]:
# One-hot encode State. drop_first=True drops the first category (California),
# which becomes the baseline and avoids perfectly collinear dummy columns.
# dtype=int pins the 0/1 encoding: pandas >= 2.0 would otherwise return
# boolean (True/False) columns.
a = pd.get_dummies(df['State'], drop_first=True, dtype=int)

In [7]:
a

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0
5,0,1
6,0,0
7,1,0
8,0,1
9,0,0


In [8]:
# Append the dummy columns to the right of the original frame (rows align on the index).
df = pd.concat([df, a], axis="columns")

In [9]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,Florida,New York
0,165349.2,136897.8,471784.1,New York,192261.83,0,1
1,162597.7,151377.59,443898.53,California,191792.06,0,0
2,153441.51,101145.55,407934.54,Florida,191050.39,1,0
3,144372.41,118671.85,383199.62,New York,182901.99,0,1
4,142107.34,91391.77,366168.42,Florida,166187.94,1,0


In [10]:
# The text column is now redundant: its information lives in the dummy columns.
df = df.drop(columns=['State'])
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,Florida,New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


## splitting the data

In [11]:
# Target (dependent variable): the Profit column.
y = df['Profit']
# Features (independent variables): every remaining column.
x = df.drop(columns=['Profit'])

In [12]:
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [13]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [14]:
# checking for null values

In [15]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
Profit             0
Florida            0
New York           0
dtype: int64

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
df.shape

(50, 6)

In [18]:
# Hold out 33% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

In [19]:
len(X_train), len(y_train)

(33, 33)

In [20]:
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
46,1315.46,115816.21,297114.46,1,0
47,0.0,135426.92,0.0,0,0
15,114523.61,122616.84,261776.23,0,1
9,123334.88,108679.17,304981.62,0,0
16,78013.11,121597.55,264346.06,0,0


In [21]:
#import and fit

In [22]:
from sklearn.linear_model import LinearRegression

In [23]:
reg =  LinearRegression()

In [24]:
# y = m1x1 + m2x2 + m3x3 + m4x4 + m5x5 + c   (five features plus an intercept)

In [25]:
# Fit the linear model on the training split only (X_train, y_train).
reg.fit(X_train,y_train)

LinearRegression()

In [26]:
X_train.shape

(33, 5)

In [27]:
reg.coef_

array([ 8.04312540e-01, -8.99140461e-02,  2.84812890e-02,  4.03924487e+02,
       -1.21407965e+03])

In [28]:
reg.intercept_

57183.412952172

In [29]:
# y = m1x1 + m2x2 + m3x3 + m4x4 + m5x5 + c

#  Developed model

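y =  8.04312540e-01 * x1 + -8.99140461e-02 * x2 + 2.84812890e-02 * x3 + 4.03924487e+02 * x4 + -1.21407965e+03 * x5 + c

where x1 = R&D Spend, x2 = Administration, x3 = Marketing Spend, x4 = Florida, x5 = New York, and c = 57183.412952172 (the fitted intercept)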

## checking training data performance

In [30]:
# Predict back on the training rows to measure in-sample fit.
y_train_pred = reg.predict(X_train)

In [31]:
y_train_pred[:10] # predictions points

array([ 56694.07716541,  45006.63062018, 144512.857242  , 155297.68934747,
       116525.92444407, 113013.76150971,  86349.3742125 ,  93924.61183384,
       190089.75258858,  61909.01028595])

In [32]:
y_train[:10] # actual points

46     49490.75
47     42559.73
15    129917.04
9     149759.96
16    126992.93
24    108552.04
34     96712.80
31     97483.56
0     192261.83
44     65200.33
Name: Profit, dtype: float64

In [33]:
# model fit (R² score) on the training data

from sklearn.metrics import r2_score

In [34]:
r2_score(y_train,y_train_pred)

0.9483841429755522

In [35]:
## training R² ≈ 0.948 (about 94.8 %)

In [36]:
## "loss" here means 1 - R²: the share of training-target variance the model
## does not explain (it is not the objective minimised during fitting)
1- r2_score(y_train,y_train_pred)

0.05161585702444782

In [37]:
# loss = 0.05161585702444782

In [38]:
# Sanity check with values pasted from the two outputs above:
# R² + (1 - R²) must equal 1.
0.9483841429755522 +0.05161585702444782

1.0

## test data performance

In [39]:
# Predict on the held-out test split to measure out-of-sample fit.
y_test_pred = reg.predict(X_test)

In [40]:
r2_score(y_test,y_test_pred)

0.9475312056928278

In [41]:
# Test R² as a percentage (value pasted from the r2_score output above).
0.9475312056928278 * 100

94.75312056928277

In [42]:
## "loss" here means 1 - R²: the share of test-target variance the model
## does not explain
1- r2_score(y_test,y_test_pred)

0.05246879430717222

In [43]:
# Unexplained share (1 - R²) on the test split as a percentage (value pasted from the output above).
0.05246879430717222 * 100

5.246879430717222

In [44]:
# Sanity check: the explained and unexplained percentages add up to 100.
94.75312056928277 + 5.246879430717222

100.0

In [47]:
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
46,1315.46,115816.21,297114.46,1,0
47,0.0,135426.92,0.0,0,0
15,114523.61,122616.84,261776.23,0,1
9,123334.88,108679.17,304981.62,0,0
16,78013.11,121597.55,264346.06,0,0


In [48]:
# Checking with a new, hand-entered sample (column order: R&D Spend, Administration, Marketing Spend, Florida, New York)

In [49]:
# Predict for one hand-entered sample. The model was fitted on a DataFrame, so
# the sample is wrapped in a DataFrame carrying the training column names: this
# makes the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning raised for bare lists.
sample = pd.DataFrame([[1.1, 10.9, 12, 0, 0]], columns=X_train.columns)
reg.predict(sample)



array([57183.65940833])

In [None]:
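# 88 %  -- NOTE(review): no cell above produces this figure (train and test R² are both ≈ 94.8 %); confirm what it refers to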