In [2]:

import numpy as np
import pandas as pd
import os
# List every file available under the Kaggle input directory (one full path per line)
kaggle_input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in kaggle_input_paths:
    print(path)


In [3]:
# Importing necessary libraries
import numpy as np
import pandas as pd

In [4]:
# Load the startups dataset into a DataFrame and preview its first rows.
# NOTE(review): this path is a Colab-style upload location, while the first cell
# lists files under /kaggle/input — confirm which environment the notebook targets.
dataset_path = '/content/50_Startups (2).csv'
df = pd.read_csv(dataset_path)
print(df.head())

   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94


In [5]:
# Separating the features (X) from the target (y):
# X keeps every column except 'Profit' as a DataFrame, y is 'Profit' as a NumPy array.
y = df['Profit'].to_numpy()
X = df.drop(columns='Profit')

# Preview the first five rows of each
print(X.head(5))
print(y[:5])

   R&D Spend  Administration  Marketing Spend       State
0  165349.20       136897.80        471784.10    New York
1  162597.70       151377.59        443898.53  California
2  153441.51       101145.55        407934.54     Florida
3  144372.41       118671.85        383199.62    New York
4  142107.34        91391.77        366168.42     Florida
[192261.83 191792.06 191050.39 182901.99 166187.94]


In [6]:
# Count the missing values in each column of the dataset
missing_per_column = df.isna().sum()
print(missing_per_column)

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64


In [7]:
# Box plots of the numeric columns, one per cell of a 2x2 grid, to inspect their distributions
import plotly.graph_objects as go
from plotly.subplots import make_subplots

x = ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']
positions = [(1,1), (1,2), (2,1), (2,2)]  # (row, col) grid slot for each column in `x`

fig_one = make_subplots(rows=2, cols=2, subplot_titles=x)

# Pair each column with its grid slot and draw its box plot there
for column, (grid_row, grid_col) in zip(x, positions):
    fig_one.add_trace(go.Box(y=df[column], name=column), row=grid_row, col=grid_col)

fig_one.update_layout(height=600, width=800, title_text="Box Plots for Distributions")
fig_one.show()


In [8]:
# One-hot encode the categorical 'State' column; all other columns pass through unchanged.
# In the transformed array the encoded State columns come first, followed by the passthrough columns.
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

state_encoder = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encoder', state_encoder, ['State'])],
    remainder='passthrough',
)

X_encoded = np.array(ct.fit_transform(X))
print(X_encoded[:5])

[[0.0000000e+00 0.0000000e+00 1.0000000e+00 1.6534920e+05 1.3689780e+05
  4.7178410e+05]
 [1.0000000e+00 0.0000000e+00 0.0000000e+00 1.6259770e+05 1.5137759e+05
  4.4389853e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.5344151e+05 1.0114555e+05
  4.0793454e+05]
 [0.0000000e+00 0.0000000e+00 1.0000000e+00 1.4437241e+05 1.1867185e+05
  3.8319962e+05]
 [0.0000000e+00 1.0000000e+00 0.0000000e+00 1.4210734e+05 9.1391770e+04
  3.6616842e+05]]


In [9]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Shapes in order: train features, test features, train target, test target
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(40, 6)
(10, 6)
(40,)
(10,)


In [10]:
# Create a multiple linear regression model and fit it on the training split
from sklearn.linear_model import LinearRegression

regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [11]:
# Use the fitted model to predict the target for every row of the held-out test set
y_pred = regressor.predict(X=X_test)
print(y_pred)

[126362.87908251  84608.45383642  99677.49425154  46357.4606858
 128750.48288498  50912.41741902 109741.350327   100643.24281643
  97599.27574598 113097.42524436]


In [12]:
# Comparing the predicted target values with the observed target values, side by side
np.set_printoptions(precision=2)  # global NumPy setting: also affects arrays printed by later cells

# Column 0 holds the predictions, column 1 the observed values
cat_targets = np.column_stack((y_pred, y_test))
print(cat_targets)

[[126362.88 134307.35]
 [ 84608.45  81005.76]
 [ 99677.49  99937.59]
 [ 46357.46  64926.08]
 [128750.48 125370.37]
 [ 50912.42  35673.41]
 [109741.35 105733.54]
 [100643.24 107404.34]
 [ 97599.28  97427.84]
 [113097.43 122776.86]]


In [13]:
# Scatter plot of observed vs. predicted target values; marker size and colour
# both encode the absolute prediction error, and an OLS trendline is overlaid.
import plotly.express as px

data = {
    'y_pred': y_pred,
    'y_test': y_test
}

df_targets = pd.DataFrame(data)
df_targets['difference'] = (df_targets['y_pred'] - df_targets['y_test']).abs()

scatter_options = dict(
    x='y_test',
    y='y_pred',
    title='Observed Profits vs Predicted Profits',
    size='difference',
    color='difference',
    color_continuous_scale='emrld',
    trendline='ols',
    trendline_color_override="forestgreen",
)
fig_two = px.scatter(df_targets, **scatter_options)

fig_two.show()

In [14]:
# Making a single prediction (R&D Spend = 250000, Administration = 150000, Marketing = 600000, State = New York)
# The raw values go through the fitted ColumnTransformer (`ct`) instead of a hand-written
# one-hot vector, so the State encoding always matches the encoding the model was trained on.
new_startup = pd.DataFrame([{
    'R&D Spend': 250000.0,
    'Administration': 150000.0,
    'Marketing Spend': 600000.0,
    'State': 'New York',
}])[list(X.columns)]  # same column order as the frame `ct` was fitted on

y_pred_new = regressor.predict(ct.transform(new_startup))
print(f'Profit: {y_pred_new}')

Profit: [263037.63]


In [15]:
# Build a human-readable formula for the fitted multiple linear regression:
# the intercept followed by one "(coefficient x feature)" term per model coefficient.
reg_intercept = regressor.intercept_
reg_coef = regressor.coef_

# Labels in the same order as the coefficients (encoded State columns first)
features = ['Dummy State 1', 'Dummy State 2', 'Dummy State 3', 'R&D Spend', 'Administration', 'Marketing']

terms = [
    f' + ({coef:.2f} x {features[idx]})'
    for idx, coef in enumerate(reg_coef)
]
formula = f'Profit(y) = {reg_intercept:.2f}' + ''.join(terms)

print(formula)

Profit(y) = 54343.30 + (-315.26 x Dummy State 1) + (623.53 x Dummy State 2) + (-308.27 x Dummy State 3) + (0.81 x R&D Spend) + (-0.07 x Administration) + (0.03 x Marketing)
