In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Load the 50-startups dataset and preview its first rows.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
csv_path = r"C:\Users\DELL\OneDrive\Data Science\class Folder\50_Startups.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [20]:
# Separate the predictors (every column except the last) from the target
# (the last column, Profit).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [21]:
# The data has one categorical column, State; list its distinct values
df['State'].unique()

array(['New York', 'California', 'Florida'], dtype=object)

- The State column among the independent variables is a categorical column with 3 unique categories - New York, California and Florida

- Convert the States into One Hot Encoding

    New York => (1, 0, 0)

    California => (0, 1, 0)

    Florida => (0, 0, 1)

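- Dummy Variable Trap: the three one-hot columns always sum to 1, so together with the intercept they are perfectly collinear. Dropping one column avoids this. With `drop_first=True` the first category in sorted order (California) becomes the baseline, and the remaining columns are (Florida, New York):

    California => (0, 0)

    Florida => (1, 0)

    New York => (0, 1)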
    
    
https://medium.com/analytics-vidhya/linear-regression-dummy-variable-trap-8964a83516d9

In [22]:
# One-hot encode State, dropping the first category so the dummy columns are
# not perfectly collinear with the intercept (dummy variable trap).
# Read from df rather than X so the cell still works when re-run after
# 'State' has been dropped from X; dtype=int keeps the dummies as 0/1 on
# pandas versions that would otherwise return booleans.
states = pd.get_dummies(df['State'], drop_first=True, dtype=int)
states.head()

Unnamed: 0,Florida,New York
0,0,1
1,0,0
2,1,0
3,0,1
4,1,0


In [23]:
# Keep only the numeric predictors. X is rebuilt from df (all columns except
# the last, minus 'State') instead of from itself, so re-running this cell
# does not fail with a KeyError once 'State' has already been removed.
X = df.iloc[:, :-1].drop(columns='State')
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend
0,165349.2,136897.8,471784.1
1,162597.7,151377.59,443898.53
2,153441.51,101145.55,407934.54
3,144372.41,118671.85,383199.62
4,142107.34,91391.77,366168.42


In [24]:
# Append the State dummy columns to the predictors. Any dummy columns already
# present in X are dropped first, so re-running this cell does not add
# duplicate dummy columns.
X = pd.concat([X.drop(columns=states.columns, errors='ignore'), states], axis=1)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


- The linear regression equation will have 5 feature coefficients (one per feature) plus an intercept
- Formula => y = B0 + B1 * x1 + B2 * x2 + B3 * x3 + B4 * x4 + B5 * x5
- B0 = intercept, x1 = R&D Spend, x2 = Administration, x3 = Marketing Spend, x4 = Florida, x5 = New York

In [25]:
# Split the data into training and testing sets, fit a linear regression, and get the predictions

In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit ordinary least squares on the training portion only.
model = LinearRegression()
model.fit(X_train, y_train)

In [27]:
# Predicted profit for each held-out row
y_pred = model.predict(X=X_test)
y_pred

array([103015.20159796, 132582.27760816, 132447.73845174,  71976.09851258,
       178537.48221055, 116161.24230165,  67851.69209676,  98791.73374687,
       113969.43533012, 167921.0656955 ])

In [28]:
# Actual profits of the held-out rows, for comparison with y_pred
y_test

28    103282.38
11    144259.40
10    146121.95
41     77798.83
2     191050.39
27    105008.31
38     81229.06
31     97483.56
22    110352.25
4     166187.94
Name: Profit, dtype: float64

Compare the predicted values with the real values using the R-squared value
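- Formula for r-squared => $R^2 = 1 - \dfrac{SS_{res}}{SS_{tot}}$ (1 - sum of residual / sum of mean)
- Sum of residual: $SS_{res} = \sum_i (y_{test,i} - y_{pred,i})^2$
- Sum of mean: $SS_{tot} = \sum_i (y_{test,i} - \bar{y}_{test})^2$, where $\bar{y}_{test}$ is the mean of y_test

- Sum of mean > Sum of residual whenever the model predicts better than always guessing the mean; the better the model, the smaller the sum of residual is relative to the sum of mean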

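- r2 value => at most 1; it is 0 for a model that always predicts the mean and can be negative for a model that does worse than that. Values of roughly 0.8 to 0.98 indicate a good fit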

- r2 value => the nearer to 1, the better the model

In [30]:
from sklearn.metrics import r2_score

# R-squared of the predictions against the actual held-out profits
score = r2_score(y_true=y_test, y_pred=y_pred)
score

0.9347068473282423

R2 score -> 0.93, which is very near to 1, showing that the model fits the test data very well