# Step 1: Necessary Library Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Step 2: Reading the data from .csv to a dataframe

In [2]:
# Load the training data from the relative path 'train.csv' into a DataFrame
df = pd.read_csv('train.csv')

### Meta-data
**Housing Values in Suburbs of Boston**  
The `medv` variable is the target variable.

**Data Description**  
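The original Boston data frame has 506 rows and 14 columns. The `train.csv` file loaded in this notebook appears to be a subset of those rows and carries an additional `ID` column.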

This data frame contains the following columns:

- **crim**  
  Per capita crime rate by town.

- **zn**  
  Proportion of residential land zoned for lots over 25,000 sq.ft.

- **indus**  
  Proportion of non-retail business acres per town.

- **chas**  
  Charles River dummy variable (= 1 if tract bounds river; 0 otherwise).

- **nox**  
  Nitrogen oxides concentration (parts per 10 million).

- **rm**  
  Average number of rooms per dwelling.

- **age**  
  Proportion of owner-occupied units built prior to 1940.

- **dis**  
  Weighted mean of distances to five Boston employment centres.

- **rad**  
  Index of accessibility to radial highways.

- **tax**  
  Full-value property-tax rate per $10,000.

- **ptratio**  
  Pupil-teacher ratio by town.

- **black**  
  $1000(Bk - 0.63)^2$ where $Bk$ is the proportion of blacks by town.

- **lstat**  
  Lower status of the population (percent).

- **medv**  
  Median value of owner-occupied homes in $1000s.


In [3]:
# Preview the first rows to check that the columns loaded as expected
df.head()

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [4]:
# Flag whether the frame holds at least one missing value anywhere
any_nulls = df.isna().to_numpy().any()
print(any_nulls)  # True when any cell is null, otherwise False


False


In [5]:
# Drop the row-identifier column so it is not used as a feature.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after 'ID' is already gone no longer
# raises a KeyError.
df = df.drop(columns='ID', errors='ignore')

In [6]:
# Preview again to confirm the ID column is no longer present
df.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
4,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9


In [7]:
# Separate the target (`medv`) from the predictor columns
y = df['medv']
X = df.drop(columns='medv')

In [8]:
# Display the feature matrix (the rendered output elides the middle rows)
X

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
4,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,0.17783,0.0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10
329,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
330,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
331,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64


In [9]:
from sklearn.model_selection import train_test_split

# Reserve 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

ModuleNotFoundError: No module named 'sklearn'

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Baseline model: ordinary least squares on all features.
# Fitting and predicting are deliberately NOT wrapped in try/except. If either
# step fails, the cell should stop right there with the original traceback
# rather than print a message and then fail later on an unfitted model or an
# undefined `y_pred`, which would hide the real cause.
model = LinearRegression()
model.fit(X_train, y_train)

# R² on the data the model was fitted on
train_score = model.score(X_train, y_train)
print(f"Training R² Score: {train_score:.4f}")

# R² on the held-out test split
test_score = model.score(X_test, y_test)
print(f"Test R² Score: {test_score:.4f}")

# Predictions for the test split, used for the error metrics below
y_pred = model.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE, so it is
# expressed in the same units as the target
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display evaluation metrics
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score (Test): {r2:.4f}")


ModuleNotFoundError: No module named 'sklearn'

In [57]:
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LinearRegression

# Base estimator, wrapped in recursive feature elimination.
# `step` controls how many features are removed per elimination round.
lm = LinearRegression()
rfe = RFE(estimator=lm, step=1)

# Shuffled 5-fold cross-validation with a fixed seed
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# Hyperparameter grid: how many features RFE should keep.
# Candidates start at 2; the exclusive upper bound is the smaller of 40 and
# (number of training columns + 1).
upper_bound = min(40, X_train.shape[1] + 1)
n_features_list = list(range(2, upper_bound))
hyper_params = [{'n_features_to_select': n_features_list}]

# Grid search over the RFE feature count, scored by R²
model_cv = GridSearchCV(
    estimator=rfe,
    param_grid=hyper_params,
    scoring='r2',
    cv=folds,
    verbose=2,                # progress messages while fitting
    return_train_score=True,
    n_jobs=-1,                # run the fits in parallel on all available cores
)

# Run the search on the training split
model_cv.fit(X_train, y_train)

# Report the winning feature count and its cross-validated score
best_n_features = model_cv.best_params_['n_features_to_select']
print(f"Best number of features: {best_n_features}")
print(f"Best cross-validation R2 score: {model_cv.best_score_:.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best number of features: 10
Best cross-validation R2 score: 0.6364


In [61]:
# Score the fitted grid search on the held-out test split
# (the search was configured with scoring='r2')
model_cv.score(X_test, y_test)

0.7184453380156279

# Optional Step: Implementing Ridge Regression with Cross Validation

In [None]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Standardize the features (important for regularized regression models).
# The scaler is fitted on the training split only and then applied to the test
# split, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regularization strengths: 50 log-spaced values from 1e-4 to 1e2
alphas = np.logspace(-4, 2, 50)

# Fit RidgeCV on the *scaled* training data. The earlier version fitted on the
# raw X_train, which left the standardized matrices above unused.
# NOTE: `clf` therefore expects inputs transformed with `scaler`.
clf = RidgeCV(alphas=alphas, cv=5, scoring='r2').fit(X_train_scaled, y_train)

# Evaluate the model on the (scaled) test data
test_score = clf.score(X_test_scaled, y_test)
print(f"Test R² Score: {test_score:.4f}")

# Make predictions on the (scaled) test data
y_pred = clf.predict(X_test_scaled)

# Calculate evaluation metrics on the test split
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display evaluation metrics and selected alpha
print(f"Selected Alpha: {clf.alpha_}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R² Score (Test): {r2:.4f}")

