Step-by-Step Implementation
 - Import Libraries
 - Load and Prepare Data
 - Split Data into Training and Testing Sets
 - Fit the Multiple Linear Regression Model
 - Evaluate the Model
 - Visualize Results

1. Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

ModuleNotFoundError: No module named 'numpy'

2. Load and Prepare Data

In [None]:
# Read the raw dataset from disk.
data = pd.read_csv('data.csv')

# Name the predictor and response columns once, then slice the frame with them.
# Replace these with your actual column names.
feature_columns = ['X1', 'X2', 'X3']
target_column = 'Y'

X = data[feature_columns]  # independent variables
Y = data[target_column]    # dependent variable


3. Split Data into Training and Testing Sets

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

4. Fit the Multiple Linear Regression Model

In [None]:
# Build the estimator and fit it on the training split in a single expression
# (LinearRegression.fit returns the fitted estimator itself).
model = LinearRegression().fit(X_train, Y_train)

# Generate predictions for the held-out test rows.
Y_pred = model.predict(X_test)


5. Evaluate the Model

In [None]:
# Score the test-set predictions against the true targets.
mse = mean_squared_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

# Report the error metrics first ...
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# ... then the fitted parameters of the linear model.
print('Intercept:', model.intercept_)
print('Coefficients:', model.coef_)

6. Visualize Results

In [None]:
# Residuals of the test-set predictions (observed minus predicted).
residuals = Y_test - Y_pred

# Scatter the residuals against the predicted values on an explicit Axes.
fig, ax = plt.subplots()
ax.scatter(Y_pred, residuals)

# Dashed zero line: the reference level for a residual of exactly 0.
ax.axhline(y=0, color='red', linestyle='--')

ax.set_xlabel('Fitted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residuals vs. Fitted Values')
plt.show()

Q-Q Plot

In [None]:
import statsmodels.api as sm

# Q-Q plot of the residuals against theoretical normal quantiles.
#
# The residuals are raw (neither centred nor scaled), so a 45-degree reference
# line (line='45') is only meaningful if they happen to have unit variance;
# otherwise perfectly normal residuals would still appear to miss the line.
# line='s' draws the "standardized" reference line instead: the theoretical
# quantiles scaled by the sample's standard deviation and shifted by its mean,
# so points from a normal sample fall on the line regardless of their scale.
sm.qqplot(residuals, line='s')
plt.title('Q-Q Plot')
plt.show()

Cook's Distance Plot

In [None]:
# Since Cook's distance is not directly available in scikit-learn, we use statsmodels
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own, so prepend a constant
# column to the training predictors.
X_train_const = sm.add_constant(X_train)

# Refit the same linear model on the training split using statsmodels
model_sm = sm.OLS(Y_train, X_train_const).fit()

# Get influence measures for every training observation
influence = model_sm.get_influence()

# cooks_distance is a pair: the distances themselves (c) and their
# associated p-values (p). Only the distances are plotted below.
(c, p) = influence.cooks_distance

# One stem per training observation, positioned by its row order in the
# training split (0 .. len(c) - 1), not by the original DataFrame index.
#
# NOTE: the former `use_line_collection=True` argument is intentionally
# omitted. It was deprecated in Matplotlib 3.6 and removed in 3.8, where
# passing it raises TypeError; since Matplotlib 3.3 the stems are drawn as a
# LineCollection by default, so the rendering is unchanged.
plt.figure()
plt.stem(np.arange(len(c)), c, markerfmt=",")
plt.xlabel('Observation Index')
plt.ylabel("Cook's Distance")
plt.title("Cook's Distance Plot")
plt.show()
