In [2]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the startups dataset from the working directory and preview the first five rows.
StartUp = pd.read_csv(filepath_or_buffer='50_Startups.csv')
StartUp.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
# Encode the categorical State column as integer codes so the regressor can consume it.
# NOTE(review): integer codes give a nominal feature an arbitrary numeric ordering, which a
# linear model treats as magnitude; one-hot encoding is the usual alternative — confirm
# that label encoding is intended here.

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(StartUp['State'])
StartUp['State'] = le.transform(StartUp['State'])
StartUp.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [5]:
# Separate the target (Profit) from the predictors (every remaining column).
y = StartUp['Profit']
X = StartUp.drop('Profit', axis=1)

In [6]:
# Sanity-check the dimensions of the feature matrix and the target vector.
for label, data in (("Shape of X:", X), ("Shape of y:", y)):
    print(label, data.shape)

Shape of X: (50, 4)
Shape of y: (50,)


In [7]:
# Convert the feature matrix and the target to NumPy arrays for scikit-learn.
# np.asarray is used instead of `.values` so this cell is idempotent: on the first run it
# yields the same arrays `.values` would, and on a re-run (when X and y are already
# ndarrays) it is a no-op instead of raising AttributeError.
X = np.asarray(X)
y = np.asarray(y)

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Multiple Linear Regression

# Fit the linear model on the training split (fit returns the estimator itself).
MLR = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split and score those predictions.
y_pred_multiple = MLR.predict(X_test)
r2_multiple = r2_score(y_test, y_pred_multiple)
mse_multiple = mean_squared_error(y_test, y_pred_multiple)

# Report the test-set metrics, one per line.
report_lines = (
    "\nMultiple Linear Regression",
    f"  Mean Squared Error: {mse_multiple:.4f}",
    f"  R² Score: {r2_multiple:.4f}",
)
print(*report_lines, sep="\n")


Multiple Linear Regression
  Mean Squared Error: 80929465.4910
  R² Score: 0.9001


In [11]:
# Show the fitted intercept, rounded to two decimal places.
intercept_rounded = round(MLR.intercept_, 2)
print(intercept_rounded)

54080.72


In [13]:
# Predict Profit for the first five rows of the full feature matrix X.
MLR.predict(X[:5])

array([192405.36723938, 188372.32164914, 183282.8185662 , 174012.86095334,
       173529.22121764])

In [14]:
# Put the test-set actual values next to the model's predictions and print the first five rows.
predictions = pd.DataFrame(
    data={'Actual': y_test, 'Predicted': y_pred_multiple},
)
preview = predictions.head(n=5)
print(preview)

      Actual      Predicted
0  134307.35  126720.661507
1   81005.76   84909.089619
2   99937.59   98890.318549
3   64926.08   46479.312402
4  125370.37  129113.183188


In [15]:
#serialization
#model persistence (saving and loading trained models)
import pickle

In [17]:
# Save: serialize the fitted model to model.pkl. The 'with' statement closes the file
# automatically when the block exits; 'wb' opens the file for writing in binary mode.
with open('model.pkl', mode='wb') as model_file:
    pickle.dump(MLR, model_file)

In [18]:
# Load: 'rb' opens model.pkl for reading in binary mode and restores the object as clf2.
# Only unpickle files you trust — pickle.load can execute arbitrary code.
with open('model.pkl', mode='rb') as model_file:
    clf2 = pickle.load(model_file)

In [19]:
# Use the reloaded model (clf2) to predict Profit for the first five rows of X.
clf2.predict(X[:5])

array([192405.36723938, 188372.32164914, 183282.8185662 , 174012.86095334,
       173529.22121764])

# Interpretation - Multiple Linear Regression Analysis

**Overview of the Model**: A Multiple Linear Regression model was developed to predict the Profit of a startup from R&D Spend, Administration, Marketing Spend, and State. The dataset was pre-processed by label-encoding the categorical variable (State) into numerical form so that it is compatible with the regression model.

**Model Parameters**

(i) **R-Squared value = 0.9001**: The R-squared value, computed on the held-out test set, indicates that the independent variables collectively explain 90.01% of the variability in Profit. This indicates a strong relationship between the predictors and the target variable.

(ii) **Intercept (𝛽0) = 54080.72**: The intercept indicates that if all independent variables (R&D Spend, Administration, Marketing Spend, and the encoded State variable) are zero, the model predicts a baseline profit of approximately 54080.72 units.

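(iii) **Mean Squared Error = 80929465.4910**: The MSE is the average squared difference between the actual and predicted profits on the test set, so it is expressed in squared profit units; its square root (RMSE) is approximately 8,996 profit units. Generally, a lower MSE indicates a better-fitting model.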

