### Using the same dataset: Civil_Engineering_Regression_Dataset.csv

Part 4: Multiple Linear Regression
12. Compare the R-squared values of simple and multiple linear regression. Which model performs better?
13. What does the Adjusted R-squared value indicate about the multiple regression model?
14. How does multicollinearity affect the model? Check Variance Inflation Factor (VIF) to detect multicollinearity.


In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

def find_most_impactful_variable(coefficients):
    """Return the name of the variable with the largest-magnitude coefficient.

    Ranking uses the absolute value so that a strongly negative coefficient is
    not overlooked in favour of a weaker positive one.

    Args:
        coefficients: Mapping of variable name -> fitted regression coefficient.

    Returns:
        The key whose coefficient has the greatest absolute value, or ``None``
        when the mapping is empty.
    """
    if not coefficients:
        return None
    return max(coefficients, key=lambda name: abs(coefficients[name]))


def compute_adjusted_r_squared(r_squared, n_samples, n_features):
    """Return the adjusted R-squared for a model with ``n_features`` predictors.

    Uses ``1 - (1 - R^2) * (n - 1) / (n - p - 1)``.

    Args:
        r_squared: Plain R-squared of the model on the evaluated sample.
        n_samples: Number of observations (n) the R-squared was computed on.
        n_features: Number of independent variables (p) in the model.

    Returns:
        The adjusted R-squared, or NaN when ``n - p - 1 <= 0`` because the
        correction is undefined without residual degrees of freedom.
    """
    residual_dof = n_samples - n_features - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - ((1 - r_squared) * (n_samples - 1) / residual_dof)


# The guard keeps the helpers importable without triggering file I/O; when run
# as a script or notebook cell (__name__ == "__main__") everything below
# executes at module level exactly as before.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv("Civil_Engineering_Regression_Dataset.csv")

    # Trim whitespace from column names
    df.columns = df.columns.str.strip()

    # Display first few rows
    print("First few rows of the dataset:")
    print(df.head())

    # Identify independent and dependent variables
    dependent_variable = "Construction_Cost"
    independent_variables = ["Building_Height", "Material_Quality_Index", "Labor_Cost", "Concrete_Strength", "Foundation_Depth"]

    # Check if required columns exist
    missing_columns = [col for col in independent_variables + [dependent_variable] if col not in df.columns]
    if missing_columns:
        print("Error: The following required columns are missing:", missing_columns)
    else:
        # Prepare data for regression
        X = df[independent_variables]  # Independent variables
        y = df[dependent_variable]  # Dependent variable

        # Split data into training and testing sets (fixed seed for reproducibility)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit simple linear regression model (using Building Height only);
        # score() reports R-squared on the held-out test split.
        simple_model = LinearRegression()
        simple_model.fit(X_train[["Building_Height"]], y_train)
        simple_r_squared = simple_model.score(X_test[["Building_Height"]], y_test)

        # Fit multiple linear regression model on all independent variables
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Get regression coefficients
        intercept = model.intercept_
        coefficients = dict(zip(independent_variables, model.coef_))
        print("\nEquation of multiple regression model:")
        print(f"Construction_Cost = {intercept:.2f} " + " ".join([f"+ ({coeff:.2f} * {var})" for var, coeff in coefficients.items()]))

        # Identify the most impactful variable by coefficient magnitude.
        # NOTE(review): raw coefficients depend on each feature's units; if the
        # features are on different scales, standardise before comparing.
        most_impactful_variable = find_most_impactful_variable(coefficients)
        print(f"\nVariable with highest impact: {most_impactful_variable} ({coefficients[most_impactful_variable]:.2f} impact on Construction_Cost)")

        # Evaluate model performance on the test split
        y_pred = model.predict(X_test)
        multiple_r_squared = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)

        print("\nModel Performance:")
        print(f"Simple Linear Regression R-squared: {simple_r_squared:.4f}")
        print(f"Multiple Linear Regression R-squared: {multiple_r_squared:.4f}")
        print(f"Mean Squared Error: {mse:.4f}")

        # Adjusted R-squared calculation (based on the test split, matching the
        # R-squared it adjusts)
        n = X_test.shape[0]  # Number of samples
        p = X_test.shape[1]  # Number of independent variables
        adjusted_r_squared = compute_adjusted_r_squared(multiple_r_squared, n, p)
        print(f"\nAdjusted R-squared: {adjusted_r_squared:.4f}")

        # Check for multicollinearity using VIF. The constant column is added so
        # each auxiliary regression has an intercept; the VIF reported for the
        # "const" row itself is not a multicollinearity measure.
        X_with_const = sm.add_constant(X)  # Add constant for VIF calculation
        vif_data = pd.DataFrame()
        vif_data["Feature"] = X_with_const.columns
        vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]

        print("\nVariance Inflation Factor (VIF) for detecting multicollinearity:")
        print(vif_data)

First few rows of the dataset:
   Project_ID  Building_Height  Material_Quality_Index  Labor_Cost  \
0           1        21.854305                       9   70.213332   
1           2        47.782144                       9  142.413614   
2           3        37.939727                       3  110.539985   
3           4        31.939632                       6  250.784939   
4           5        12.020839                       7  167.575159   

   Concrete_Strength  Foundation_Depth  Weather_Index  Construction_Cost  
0          45.326394          8.804790              4        2400.287931  
1          47.900505          6.727632              6        3705.461312  
2          22.112484          8.208544              8        2653.631004  
3          26.267562          7.094515              4        2534.099466  
4          40.134306          6.160303              6        1741.179333  

Equation of multiple regression model:
Construction_Cost = -9.64 + (49.81 * Building_Height) + (1