In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression,Ridge,Lasso,ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,mean_squared_error,r2_score,make_scorer
from sklearn.model_selection import cross_val_score,KFold

In [26]:
# Load the house price data
data = pd.read_csv('C:\\Users\\91934\\OneDrive\\Desktop\\GMU\\Ait 664\\Hands on session\\Boston-house-price-data.csv')
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  MEDV     506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB
None
             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  506.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean     3.613524   11.363636   11.136779    0.069170    0.554695   

In [27]:
missing=data.isnull().sum()
missing

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [28]:
data.dropna(inplace=True)
target= 'MEDV'

In [29]:
# all are in numerical columns so proceed
X = data.drop(target,axis=1)
y= data['MEDV']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

In [31]:
# Initialize and train the Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_linear = linear_model.predict(X_test)

# Evaluate the Linear Regression model
print("Linear Regression Mean Squared Error:", mean_squared_error(y_test, y_pred_linear))
print("Linear Regression R^2 Score:", r2_score(y_test, y_pred_linear))

Linear Regression Mean Squared Error: 21.51744423117727
Linear Regression R^2 Score: 0.7112260057484925


In [32]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the Random Forest Regressor
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_rf))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))

Random Forest Regressor Mean Squared Error: 9.609646282894735
Random Forest Regressor R^2 Score: 0.8710341288379007


In [33]:
# Create a RandomForestRegressor with regularization parameters
regressor = RandomForestRegressor(
    n_estimators=100,          # Number of trees
    max_depth=15,              # Limit the depth of each tree
    min_samples_split=8,       # Minimum samples required to split an internal node
    min_samples_leaf=8,        # Minimum samples required to be at a leaf node
    max_features='sqrt',       # Use the square root of the total features at each split
    random_state=42
)

# Fit the model
regressor.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Random Forest Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("Random Forest Regressor R^2 Score:", r2_score(y_test, y_pred_rf))


Random Forest Regressor Mean Squared Error: 15.211648087327058
Random Forest Regressor R^2 Score: 0.8710341288379007


In [34]:
# Assuming X and y are your features and target variable
# Define the model with regularization parameters
regressor = RandomForestRegressor(
    n_estimators=100,
    max_depth=15,
    min_samples_split=8,
    min_samples_leaf=8,
    max_features='sqrt',
    random_state=42
)
# Set up k-fold cross-validation (8 folds)
kf = KFold(n_splits=8, shuffle=True, random_state=42)
# Perform cross-validation and calculate MSE for each fold
scores = cross_val_score(
    regressor, X, y, cv=kf, scoring=make_scorer(mean_squared_error)
)
# Calculate the mean and standard deviation of the MSE scores
mean_mse = np.mean(scores)
std_mse = np.std(scores)
print(f"Mean MSE from cross-validation: {mean_mse:.2f}")
print(f"R^2 Score: {r2_score(y_test, y_pred):.2f}")


Mean MSE from cross-validation: 16.56
R^2 Score: 0.80


In [35]:
#define the gradient boosting model with different parameters
gbm_model = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1, max_depth=3, random_state=42)
gbm_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_gbm = gbm_model.predict(X_test)

# Evaluate the Gradient Boosting Regressor
print("Gradient Boosting Regressor Mean Squared Error:", mean_squared_error(y_test, y_pred_gbm))
print("Gradient Boosting Regressor R^2 Score:", r2_score(y_test, y_pred_gbm))


Gradient Boosting Regressor Mean Squared Error: 7.757442405928452
Gradient Boosting Regressor R^2 Score: 0.8958915564195972


In [36]:
# L2 Regularization: Ridge Regression
ridge = Ridge(alpha=1.0)  # alpha controls the regularization strength; higher means more regularization
ridge.fit(X_train, y_train)
ridge_predictions = ridge.predict(X_test)
print("Ridge MSE:", mean_squared_error(y_test, ridge_predictions))
print("Ridge R^2 Score:", r2_score(y_test, ridge_predictions))

# L1 Regularization: Lasso Regression
lasso = Lasso(alpha=0.1)  # alpha is the regularization parameter
lasso.fit(X_train, y_train)
lasso_predictions = lasso.predict(X_test)
print("Lasso MSE:", mean_squared_error(y_test, lasso_predictions))
print("Lasso R^2 Score:", r2_score(y_test, lasso_predictions))

# L1 + L2 Regularization: Elastic Net
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)  # l1_ratio balances between L1 and L2 (0 = pure L2, 1 = pure L1)
elastic_net.fit(X_train, y_train)
elastic_net_predictions = elastic_net.predict(X_test)
print("Elastic Net MSE:", mean_squared_error(y_test, elastic_net_predictions))
print("Elastic Net R^2 Score:", r2_score(y_test, elastic_net_predictions))

Ridge MSE: 21.5485004029582
Ridge R^2 Score: 0.7108092176450825
Lasso MSE: 22.79791327777329
Lasso R^2 Score: 0.6940415224460941
Elastic Net MSE: 22.480044769402568
Elastic Net R^2 Score: 0.6983074639688323


Conclusion:
Linear Regression - 0.7112260057484925,
Random Forest Regressor - 0.8710341288379007,
Random Forest Regressor with regularization - 0.8710341288379007,
Gradient Boosting Regressor - 0.8958915564195972,
Ridge Regression - 0.7108092176450825,
Lasso Regression - 0.6940415224460941,
Elastic Regression - 0.6983074639688323,