In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

In [None]:
# Load the startups dataset from a local CSV and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where that exact file exists; consider a configurable data directory.
csv_path = r'C:\Users\ronak\Downloads\Stat\50_startups.csv'
df_start = pd.read_csv(csv_path)
df_start.head()

In [None]:
# Display summary statistics of the loaded dataset as the cell output.
df_start.describe()

In [None]:
# Distribution of the target column 'Profit'.
# sns.distplot is deprecated (and removed in recent seaborn releases); histplot
# with kde=True and stat='density' draws the equivalent density-scaled histogram
# with a KDE curve on top.
sns.histplot(df_start['Profit'], kde=True, stat='density')
plt.title('Profit Distribution Plot')
plt.show()

In [None]:
# Scatter of R&D spend against profit, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.scatter(df_start['R&D Spend'], df_start['Profit'], color='lightcoral')
ax.set_title('R&D Spend vs Profit')
ax.set_xlabel('R&D Spend')
ax.set_ylabel('Profit')
ax.set_frame_on(False)  # hide the axes frame, as plt.box(False) does
plt.show()

In [None]:
# Feature matrix: every column except the last. Target vector: the last column.
x = df_start.iloc[:, :-1].to_numpy()
y = df_start.iloc[:, -1].to_numpy()

In [None]:
# One-hot encode the categorical column at position 3 and pass the remaining
# feature columns through unchanged.
# The encoder reads straight from df_start rather than from the current `x`, so
# re-running this cell always yields the same matrix (previously a second run
# would try to re-encode column 3 of the already-transformed array).
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array, whatever the density
)
# Cast to float so downstream numeric code does not receive an object-dtype array.
x = np.asarray(ct.fit_transform(df_start.iloc[:, :-1].values), dtype=float)

# Check for Multicollinearity:

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor for every column of the encoded design matrix `x`.
# The rows used to be labelled with df_start.columns[:-1] (one name per raw
# column), but after one-hot encoding `x` generally has more columns than that,
# so the second assignment failed with a length mismatch. Build one label per
# encoded column instead: the transformer emits the encoder's columns first,
# followed by the passthrough columns in their original order.
categorical_pos = 3  # same column index that was handed to the OneHotEncoder
raw_feature_names = list(df_start.columns[:-1])
encoded_names = [
    f'{raw_feature_names[categorical_pos]}_{category}'
    for category in ct.named_transformers_['encoder'].categories_[0]
]
passthrough_names = [
    name for pos, name in enumerate(raw_feature_names) if pos != categorical_pos
]

# statsmodels needs a numeric matrix; `x` may be an object-dtype array here.
x_numeric = np.asarray(x, dtype=float)

vif_data = pd.DataFrame({
    "feature": encoded_names + passthrough_names,
    "VIF": [variance_inflation_factor(x_numeric, i) for i in range(x_numeric.shape[1])],
})
vif_data

# Check Residuals for Homoscedasticity:

In [None]:
# Residuals-vs-predicted plot for the held-out rows.
# This cell sits above the cells that split the data, fit the model and predict,
# so on a fresh kernel y_test and y_pred did not exist yet. Fit here with the
# same split parameters and seed used by those later cells, so the values are
# identical and the notebook runs top to bottom.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
regressor = LinearRegression()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)

residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.axhline(0, color='grey', linestyle='--')  # zero-error reference line
plt.xlabel('Predicted Profit')
plt.ylabel('Residuals')
plt.title('Residuals vs Predicted Profit')
plt.show()

# Check Residuals for Normality:

In [None]:
import statsmodels.api as sm

# Q-Q plot of the residuals against a normal distribution.
# fit=True standardizes the residuals (fitted location subtracted, divided by the
# fitted scale) before plotting. Without it the raw residuals are compared
# directly to the 45-degree line, which is only meaningful for data already in
# standard-normal units.
sm.qqplot(residuals, line='45', fit=True)
plt.show()

# Check for Independence of Errors:

In [None]:
from statsmodels.stats.stattools import durbin_watson

# Durbin-Watson statistic computed on the residuals; printed for inspection.
dw = durbin_watson(residuals)
print('Durbin-Watson: {}'.format(dw))

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a linear regression model on the training split (fit returns the estimator itself).
regressor = LinearRegression().fit(x_train, y_train)
regressor

In [None]:
# Predict the target for the held-out test rows with the fitted regressor.
y_pred=regressor.predict(x_test)

Because we have multiple independent variables, we cannot plot a single graph to compare the results visually. Instead, we compare a few predicted values with the corresponding actual values side by side.

In [None]:
# Show predictions next to the actual values:
# column 0 holds the predicted value, column 1 the actual one.
np.set_printoptions(precision=2)
result = np.column_stack((y_pred, y_test))
result

Coefficients
We can get the coefficients and the intercept of the multiple linear regression equation y = β0 + β1x1 + β2x2 + β3x3 + … + βnxn from the fitted regressor model.

In [None]:
# Report the fitted coefficients and the intercept of the regression model.
print('Coefficient:{}'.format(regressor.coef_))
print('Intercept:{}'.format(regressor.intercept_))

# Model validation

Cross-Validation: estimates how well the model generalizes to unseen data by evaluating it on several different train/validation splits.

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation over the full feature matrix.
# The scorer is the negated MSE, so the sign is flipped before printing.
scores = cross_val_score(
    regressor,
    x,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)
print('Cross-Validation Scores: {}'.format(-scores))
print('Mean MSE: {}'.format(-scores.mean()))

Performance Metrics:
- Mean Squared Error (MSE): a lower MSE indicates better model performance.
- Mean Absolute Error (MAE): a lower MAE indicates better model performance.
- R-squared (R²): values closer to 1 indicate that a higher proportion of the variance in the dependent variable is explained by the model.

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hold-out metrics comparing the test targets with the model's predictions.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ('Mean Squared Error', mse),
    ('Mean Absolute Error', mae),
    ('R-squared', r2),
):
    print(f'{label}: {value}')

Learning Curves: show how the model's training and validation error change as the size of the training set grows.

In [None]:
from sklearn.model_selection import learning_curve

# Training and validation error at ten training-set fractions from 10% to 100%,
# each evaluated with 10-fold cross-validation.
train_sizes, train_scores, test_scores = learning_curve(
    regressor,
    x,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
    train_sizes=np.linspace(0.1, 1.0, 10),
)

# Scores are negated MSE; average across folds and flip the sign to get errors.
train_scores_mean = -train_scores.mean(axis=1)
test_scores_mean = -test_scores.mean(axis=1)

fig, ax = plt.subplots()
ax.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training error')
ax.plot(train_sizes, test_scores_mean, 'o-', color='g', label='Validation error')
ax.set_xlabel('Training Size')
ax.set_ylabel('Error')
ax.set_title('Learning Curves')
ax.legend(loc='best')
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression

from IPython.display import display

# Consolidated single-cell version of the workflow: load, explore, encode,
# split, fit, predict and report.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_start = pd.read_csv(r'C:\Users\ronak\Downloads\Stat\50_startups.csv')
# A bare expression in the middle of a cell produces no output, so display explicitly.
display(df_start.head())

# sns.distplot is deprecated (and removed in recent seaborn releases); histplot
# with kde=True and stat='density' draws the equivalent histogram with a KDE curve.
sns.histplot(df_start['Profit'], kde=True, stat='density')
plt.title('Profit Distribution Plot')
plt.show()

plt.scatter(df_start['R&D Spend'], df_start['Profit'], color='lightcoral')
plt.title('R&D Spend vs Profit')
plt.xlabel('R&D Spend')
plt.ylabel('Profit')
plt.box(False)
plt.show()

# Features: every column except the last. Target: the last column.
x = df_start.iloc[:, :-1].values
y = df_start.iloc[:, -1].values

# One-hot encode the categorical column at position 3; pass the rest through.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
x = np.array(ct.fit_transform(x))

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(x_train, y_train)

# Predict the target for the held-out rows.
y_pred = regressor.predict(x_test)

# Predicted (column 0) next to actual (column 1) values.
np.set_printoptions(precision=2)
result = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1)
# Not the last expression of the cell, so it must be displayed explicitly.
display(result)

print(f'Coefficient:{regressor.coef_}')
print(f'Intercept:{regressor.intercept_}')
