In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler #For feature scaling


In [25]:
# Read the 50_Startups CSV into a DataFrame.
# NOTE(review): absolute path — confirm it exists in the environment this runs in.
DATA_PATH = "/content/50_Startups.csv"
mydata = pd.read_csv(DATA_PATH)

In [26]:
# Preview the first five rows (5 is the default for head()).
mydata.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [27]:
# Target is the 'Profit' column; every other column is treated as a feature.
y = mydata.loc[:, 'Profit']
X = mydata.drop('Profit', axis='columns')

In [28]:
# Report the dimensions of the feature frame and the target.
print(f"Shape of X: {X.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (50, 4)
Shape of y: (50,)


In [29]:
# Replace the pandas objects with their underlying array values.
X, y = X.values, y.values

In [35]:
# Build the model inputs in this cell so the split does not depend on `x`/`y`
# having been defined by a cell further down; on a fresh top-to-bottom run the
# lowercase `x` would not exist yet and the split would raise NameError.
features = mydata.drop(columns=['Profit'])
if 'State' in features.columns:
    # 'State' holds text labels; one-hot encode it (dropping the first level)
    # so the scaler and the regression receive numeric columns only.
    features = pd.get_dummies(features, columns=['State'], drop_first=True)
x = features
y = mydata['Profit']  # kept as a Series so y_test retains the original row labels

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [36]:
# Report the dimensions of the training split.
print(f"Shape of x: {x_train.shape}")
print(f"Shape of y: {y_train.shape}")

Shape of x: (40, 5)
Shape of y: (40,)


In [37]:
# Feature scaling: the scaler is fitted on the training split only, and that
# same fitted transform is then applied to both the training and test splits.
scaler = StandardScaler().fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [38]:
# One-hot encode the categorical 'State' column; drop_first=True removes the
# first level so k categories become k-1 dummy columns.
#
# The cell is guarded so it is safe to re-run: once 'State' has been replaced
# by its 'State_*' dummy columns there is nothing left to encode, and those
# existing dummies must not be passed through get_dummies a second time.

# Columns produced by an earlier encoding pass, if any.
state_columns = [col for col in mydata.columns if col.startswith('State_')]

if 'State' in mydata.columns:
  mydata = pd.get_dummies(mydata, columns=['State'], drop_first=True)
elif not state_columns:
  # Neither the raw column nor its dummies are present: fail loudly rather
  # than continue with a frame that is missing the state information.
  raise KeyError("'State' column not found and no 'State_*' dummy columns exist in mydata")

In [39]:
# Separate features (x) and target (y) from mydata.
# NOTE(review): the fit below uses x_train / y_train, not these two names.
y = mydata['Profit']
x = mydata.drop(columns=['Profit'])

# ---------------------------------------
# Multiple Linear Regression
# ---------------------------------------
# fit() returns the estimator itself, so construction and training are chained.
model_multiple = LinearRegression().fit(x_train, y_train)

# Predict on the test split and score the predictions against the true values.
y_pred_multiple = model_multiple.predict(x_test)
r2_multiple = r2_score(y_test, y_pred_multiple)
mse_multiple = mean_squared_error(y_test, y_pred_multiple)

# One print call, newline-separated, producing the same three-line report.
print(
    "\nMultiple Linear Regression",
    f"  Mean Squared Error: {mse_multiple:.4f}",
    f"  R² Score: {r2_multiple:.4f}",
    sep="\n",
)


Multiple Linear Regression
  Mean Squared Error: 82010363.0450
  R² Score: 0.8987


In [40]:
# Show the fitted intercept term of the linear model.
print(model_multiple.intercept_)

115651.72050000001


In [50]:
# Display the class of the fitted estimator.
# NOTE(review): this cell's execution count is out of sequence with its neighbours
# and it has no saved output — looks like leftover inspection; confirm it is still needed.
type(model_multiple)

In [43]:
# Display the first few test-set predictions alongside the actual values.
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.
predictions = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_multiple})
print(predictions.head())

       Actual      Predicted
13  134307.35  126362.879083
39   81005.76   84608.453836
30   99937.59   99677.494252
45   64926.08   46357.460686
17  125370.37  128750.482885


Interpretation:
Among the five rows shown, the largest discrepancy occurs at index 45:
Actual: 64,926.08
Predicted: 46,357.46
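The model underestimates this observation by about 18,569 (roughly 29% of the actual value). This suggests a potential issue in the model for lower actual values: it might indicate a bias where the model struggles with lower data points. However, a single row is not enough to confirm this; a residual plot over the full test set would show whether the pattern holds.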

At index 30, the prediction is almost identical to the actual value (99,937.59 vs. 99,677.49). This indicates that the model is capable of producing highly accurate results under certain conditions.
The R² score of 0.8987 indicates that about 89.87% of the variance in the target variable (Profit) on the held-out test set is explained by the predictors in the model.
This is a strong result, showing that the model captures most of the underlying relationship between predictors and the target variable. However, there is still about 10.13% of variance not explained, which might be due to noise, omitted variables, or non-linear patterns.
The model both overestimates and underestimates. In rows 2 and 5 (indices 39 and 17), the predicted values exceed the actual values, while in rows 1, 3, and 4 (indices 13, 30, and 45), the predictions fall short.