In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import pickle

# --- 1. Load the Dataset ---
# Assuming 'BostonHousing.csv' is in the same directory as your notebook.
# Only the read itself is guarded, so an error raised by the preview calls
# below is never mistaken for a missing file.
try:
    df = pd.read_csv('BostonHousing.csv')
except FileNotFoundError as err:
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper,
    # and inside a notebook it asks the kernel to shut down rather than simply
    # stopping this cell with a readable traceback.
    raise FileNotFoundError(
        "Error: BostonHousing.csv not found. Make sure the file is in the correct directory."
    ) from err

print("Dataset loaded successfully.")
print(df.head())
print("\nDataset Info:")
df.info()

# --- 2. Data Preprocessing ---
# Report how many values are missing in each column before imputation.
print("\nMissing values before handling:")
print(df.isnull().sum())

# Handle missing values by filling each affected column with its median.
# For 'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'price'
# Median imputation is often robust to outliers.
for column in df.columns:
    if df[column].isnull().any():
        median_val = df[column].median()
        # Assign the filled Series back to the frame. Calling
        # `df[column].fillna(..., inplace=True)` operates on an intermediate
        # object: it warns on recent pandas and leaves `df` unchanged once
        # Copy-on-Write is active.
        df[column] = df[column].fillna(median_val)
        print(f"Filled missing values in '{column}' with median: {median_val}")

# Report the counts again so the effect of the imputation is visible.
print("\nMissing values after handling:")
print(df.isnull().sum())

# Define features (X) and target (y)
# 'price' is the target variable (Median value of owner-occupied homes in $1000s)
X = df.drop('price', axis=1)
y = df['price']

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage) and would bake test-set statistics into the saved scaler.
# The split itself depends only on the row count and random_state, so the
# same rows land in train/test as when splitting the scaled frame.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling
# The scaler learns its statistics from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train_raw)

# Whole-dataset view, transformed with the training statistics. Kept under the
# original names (`X_scaled`, `X_scaled_df`) for inspection and for any later
# cell that refers to them.
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

print("\nScaled Features (first 5 rows):")
print(X_scaled_df.head())

# --- 3. Model Training ---
# Scaled train/test frames; column names are preserved so the fitted model
# records the feature names.
X_train = pd.DataFrame(
    scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Build the Linear Regression estimator and fit it on the training split in a
# single expression; fit() hands back the estimator itself.
model = LinearRegression().fit(X_train, y_train)

print("\nLinear Regression model trained.")

# --- 4. Model Evaluation ---
# Predict on the test split and score the predictions against the true targets.
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above

# Emit the metric report in one call; the printed text is unchanged.
evaluation_report = [
    f"\nModel Evaluation:",
    f"Mean Squared Error (MSE): {mse:.2f}",
    f"Root Mean Squared Error (RMSE): {rmse:.2f}",
    f"R-squared (R2): {r2:.2f}",
]
print("\n".join(evaluation_report))

# List each fitted coefficient next to its feature name, then the intercept.
print("\nModel Coefficients:")
for position, feature in enumerate(X.columns):
    coef = model.coef_[position]
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {model.intercept_:.4f}")


# --- 5. Save the Model and Scaler ---
# Both objects are persisted: the scaler is needed alongside the model so new
# input data in the Flask app can be scaled exactly like the training data.
model_filename = 'linear_regression_model.pkl'
scaler_filename = 'scaler.pkl'

# Each entry pairs an object with its destination file and the message that
# confirms the write; the model is written first, then the scaler.
artifacts = (
    (model, model_filename, f"\nModel saved as '{model_filename}'"),
    (scaler, scaler_filename, f"Scaler saved as '{scaler_filename}'"),
)
for artifact, target_name, confirmation in artifacts:
    with open(target_name, 'wb') as file:
        pickle.dump(artifact, file)
    print(confirmation)

print("\nNotebook execution complete. You should now have 'linear_regression_model.pkl' and 'scaler.pkl' files.")

Dataset loaded successfully.
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  price  
0  396.90   4.98   24.0  
1  396.90   9.14   21.6  
2  392.83   4.03   34.7  
3  394.63   2.94   33.4  
4  396.90   5.33   36.2  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-nu