### 3. Model Building and Training
#### Task 3: Model Training

Notebook: notebooks/Model_Training.ipynb
Steps:
- Choose appropriate features for the model.
- Train a linear regression model.
- Perform hyperparameter tuning (if applicable).

- Script: scripts/train_model.py


### Import Libraries

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


### Load the Preprocessed Dataset

In [12]:
# Directory that holds the raw CSV and the four split files.
# It is defined once so the notebook can be pointed at a different checkout by
# editing a single line instead of five hard-coded absolute paths.
# NOTE(review): a path relative to the repository would be more portable --
# confirm the notebook's working directory before switching.
DATA_DIR = Path(r"D:\Assignment\Machine Lerning\machine-learning-introduction-Sabuna-Gamal\Data")

# Load the full Boston Housing table (contains the `medv` column used as target below)
file_path = DATA_DIR / "BostonHousing.csv"
df = pd.read_csv(file_path)  # Read CSV into a DataFrame

# Load the preprocessed dataset (train/test feature and target files)
X_test = pd.read_csv(DATA_DIR / "X_test.csv")
X_train = pd.read_csv(DATA_DIR / "X_train.csv")
y_test = pd.read_csv(DATA_DIR / "y_test.csv")
y_train = pd.read_csv(DATA_DIR / "y_train.csv")

# Display the first few rows of each loaded split
print(X_test.head())
print(X_train.head())
print(y_test.head())
print(y_train.head())


       crim        zn     indus      chas       nox        rm       age  \
0 -0.236041  0.871318 -0.501726 -0.260378 -1.073076  0.439527 -0.440185   
1 -0.214499  0.237584 -0.144178 -0.260378  0.338419 -0.580839  0.940816   
2  0.089608 -0.596277  2.349763 -0.260378  1.856155 -0.930475  1.402401   
3  2.283708  0.737901 -0.837928 -0.260378  2.205235  2.565886  1.263550   
4 -0.332314 -0.596277 -0.823697 -0.260378  0.125936 -1.608341  1.038387   

        dis       rad       tax   ptratio         b     lstat  
0  1.951028  1.484407  0.243996  0.358605 -2.474124 -0.268091  
1  1.253798  0.281608 -0.003673 -1.785172 -0.692920  1.404934  
2 -1.527156 -0.319792  1.638760  1.512946  0.098231  2.336354  
3 -1.295462  0.281608 -0.616326 -2.994482  0.303070  0.113747  
4 -1.003274  0.281608 -0.199200 -1.015611  0.819619  0.873009  
       crim        zn     indus      chas       nox        rm       age  \
0  0.569941 -0.596277  0.339668  3.840573 -0.192789 -2.007925  1.042140   
1 -0.561767 -0.

In [17]:
# Count NaNs per column first, so the effect of the cleanup below is visible
missing_per_column = df.isnull().sum()
print("Missing values before handling:\n", missing_per_column)

# Discard every row that contains at least one NaN
# (reasonable only while the number of affected rows is small)
df = df.dropna()

Missing values before handling:
 crim       0
zn         0
indus      0
chas       0
nox        0
rm         5
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


In [18]:
# Separate the target column from the predictors
target_column = 'medv'
y = df[target_column]  # Target variable (house price)
X = df.drop(columns=[target_column])  # Features: every column except the target


### Split Data into Training and Testing Sets

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# frames that were read from the preprocessed CSV files earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the (rows, columns) shape of each feature split
for label, subset in (("Training set size:", X_train), ("Testing set size:", X_test)):
    print(label, subset.shape)


Training set size: (400, 13)
Testing set size: (101, 13)


### Train a Linear Regression Model

In [20]:
# Create the estimator and fit it on the training split in a single expression
# (fit() returns the fitted estimator itself, so `model` is the trained model)
model = LinearRegression().fit(X_train, y_train)

# Report the learned parameters: one coefficient per feature, plus the intercept
print("Model Coefficients:", model.coef_)
print("Model Intercept:", model.intercept_)


Model Coefficients: [-9.68696752e-02  4.39366725e-02  4.06879802e-02  2.21236621e+00
 -1.45083232e+01  3.97614280e+00  9.84686278e-03 -1.35437909e+00
  2.93898795e-01 -1.21174924e-02 -8.68721190e-01  1.28288751e-02
 -6.10251464e-01]
Model Intercept: 30.626081825846626


### Make Predictions

In [21]:
# Predict the target for every row of the held-out test set
y_pred = model.predict(X_test)

# Show only the first five predictions as a quick sanity check
first_five_predictions = y_pred[:5]
print("Predicted Prices:", first_five_predictions)


Predicted Prices: [10.18296392 22.90028638 15.7250715  32.8687425  23.06305477]


### Evaluate Model Performance

In [22]:
# Score the test-set predictions with both metrics:
#   mse - Mean Squared Error (MSE)
#   r2  - R-squared (R²) score
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


Mean Squared Error: 20.687720473048508
R-squared Score: 0.7200277678580314


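### Hyperparameter Tuning

The cell below does not search over hyperparameters; it compares the baseline against a second linear regression trained on degree-2 polynomial features.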

In [23]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with degree-2 polynomial terms.
# The expansion is fitted on the training split and then applied unchanged
# to the test split.
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit a second linear regression on the expanded feature matrix
poly_model = LinearRegression().fit(X_train_poly, y_train)

# Score the polynomial model on the expanded test split
y_pred_poly = poly_model.predict(X_test_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)
poly_r2 = r2_score(y_test, y_pred_poly)
print("MSE with Polynomial Features:", poly_mse)
print("R² Score with Polynomial Features:", poly_r2)


MSE with Polynomial Features: 12.45895834024775
R² Score with Polynomial Features: 0.831389718300416


In [24]:
import joblib

# File the model is written to (relative to the current working directory)
model_filename = "linear_regression_model.pkl"

# Persist the fitted model; note this is `model` (the plain linear regression),
# not `poly_model`
joblib.dump(model, model_filename)

# Read it straight back (for later use)
loaded_model = joblib.load(model_filename)
