In [32]:
pip install --upgrade notebook


Collecting notebook
  Downloading notebook-7.3.3-py3-none-any.whl.metadata (10 kB)
Collecting jupyterlab-server<3,>=2.27.1 (from notebook)
  Downloading jupyterlab_server-2.27.3-py3-none-any.whl.metadata (5.9 kB)
Collecting jupyterlab<4.4,>=4.3.6 (from notebook)
  Downloading jupyterlab-4.3.6-py3-none-any.whl.metadata (16 kB)
Downloading notebook-7.3.3-py3-none-any.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hDownloading jupyterlab-4.3.6-py3-none-any.whl (11.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.7/11.7 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading jupyterlab_server-2.27.3-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.7/59.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jupyterlab-server, jupyterlab, notebook
  Attempting uninsta

In [20]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/boston_housing.csv')

In [3]:
# One-hot encode 'rad' into rad_* indicator columns; drop_first=True omits the first
# level so only k-1 dummies are produced for k distinct values.
df_encoded = pd.get_dummies(df, columns=['rad'], prefix='rad', drop_first=True)

In [4]:
# Target is 'medv'; every other column of the encoded frame is a predictor
y = df_encoded['medv']
X = df_encoded.drop(columns='medv')

In [5]:
# Standardize numerical features
# Only the columns listed here are rescaled; every other column of X is left as-is.
numerical_cols = ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'b', 'lstat']
scaler = StandardScaler()
# NOTE(review): the scaler is fit on all rows of X, before the train/test split in the
# following cell, so test rows contribute to the scaling statistics — consider fitting
# on the training split only.
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [6]:
# Split the data: 20% of rows are held out for testing; random_state=42 fixes the shuffle
# so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# 2. Train a Linear Regression Model
# The model is fitted on the training split only; X_test / y_test are not used here.
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

In [9]:
# Predict on the held-out test split and evaluate.
# lr_model is already fitted by the training cell, so it is not refit here; fit()
# returns the estimator, not predictions, and must not be stored in y_pred_lr.
y_pred_lr = lr_model.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_r2 = r2_score(y_test, y_pred_lr)

print("\nLinear Regression Performance:")
print(f"Mean Squared Error (MSE): {lr_mse:.2f}")
print(f"R^2 Score: {lr_r2:.2f}")


Linear Regression Performance:
Mean Squared Error (MSE): 24.82
R^2 Score: 0.66


# Feature Engineering

In [15]:
# Creating new features for our dataset

def create_new_features(df):
    """Return a copy of ``df`` with nine engineered feature columns appended.

    The input frame is NOT modified, so the cell that calls this function can be
    re-run safely and the raw data stays available under its original name.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns 'rm', 'tax', 'nox', 'dis', 'age', 'rad',
        'crim' and 'ptratio'. 'rad' has to be the raw numeric column (i.e. call this
        before one-hot encoding it), because it is multiplied with 'dis'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all original columns followed by: rooms_per_tax,
        nox_to_dis, rm_age, dis_rad, rm_squared, age_squared, log_crim, log_dis,
        economic_index.
    """
    # assign() builds a new DataFrame; all new columns depend only on original columns,
    # so they can be computed in a single call. Keyword order fixes the column order.
    return df.assign(
        # 1. Ratio features
        rooms_per_tax=df['rm'] / df['tax'],
        nox_to_dis=df['nox'] / df['dis'],
        # 2. Interaction features
        rm_age=df['rm'] * df['age'],
        dis_rad=df['dis'] * df['rad'],
        # 3. Polynomial features
        rm_squared=df['rm'] ** 2,
        age_squared=df['age'] ** 2,
        # 4. Log transformations. Only 'crim' gets a small offset to avoid log(0);
        #    'dis' is logged as-is, so non-positive values would give -inf / NaN.
        log_crim=np.log(df['crim'] + 0.0001),
        log_dis=np.log(df['dis']),
        # 5. Combined economic indicator
        economic_index=df['tax'] / df['ptratio'],
    )

In [26]:
# Apply feature engineering before encoding 'rad'
# (create_new_features multiplies 'dis' by the numeric 'rad' column, and get_dummies
# below replaces that column with rad_* indicators, so the order matters)
df = create_new_features(df)

# Now one-hot encode 'rad'
df_encoded = pd.get_dummies(df, columns=['rad'], prefix='rad', drop_first=True)


In [27]:
# Target is 'medv'; every other column (original, engineered and rad_* dummies) is a predictor
y = df_encoded['medv']
X = df_encoded.drop(columns='medv')

In [28]:
# Standardize numerical features
# Covers the original numeric columns plus the nine engineered ones; columns not
# listed here are left unscaled.
numerical_cols = ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'tax', 'ptratio', 'b', 'lstat',
                 'rooms_per_tax', 'nox_to_dis', 'rm_age', 'dis_rad', 'rm_squared', 'age_squared',
                 'log_crim', 'log_dis', 'economic_index']
scaler = StandardScaler()
# NOTE(review): the scaler is fit on every row of X before the cross-validation that
# follows, so each fold's held-out rows contribute to the scaling statistics — confirm
# this is acceptable or move scaling inside the CV loop.
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

In [29]:
# Define feature sets
# Each set is: base columns + (optional) one group of engineered columns + rad_* dummies.
rad_columns = [col for col in df_encoded.columns if col.startswith('rad_')]

# Original dataset columns shared by every feature set ('rad' itself is represented
# by rad_columns).
base_features = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'tax',
                 'ptratio', 'b', 'lstat']

# Engineered columns grouped by the kind of transformation that produced them.
# Dict order determines the order of the resulting feature sets.
engineered_groups = {
    'ratios': ['rooms_per_tax', 'nox_to_dis'],
    'interactions': ['rm_age', 'dis_rad'],
    'polynomial': ['rm_squared', 'age_squared'],
    'log': ['log_crim', 'log_dis'],
}

feature_sets = {'original': base_features + rad_columns}
for group_name, group_cols in engineered_groups.items():
    feature_sets[f'original_plus_{group_name}'] = base_features + group_cols + rad_columns

# 'economic_index' belongs to no group above and is only evaluated as part of the full set.
all_engineered = [c for cols in engineered_groups.values() for c in cols] + ['economic_index']
feature_sets['all_features'] = base_features + all_engineered + rad_columns

In [30]:
# Function to evaluate features
def evaluate_features(X, y, feature_sets, cv=5):
    """Cross-validate a LinearRegression on each named subset of columns.

    Both metrics are computed from a single cross-validation pass per feature set
    (one fit per fold) instead of running the whole CV once per metric.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain every column named in ``feature_sets``.
    y : array-like
        Target values aligned with the rows of ``X``.
    feature_sets : dict[str, list[str]]
        Maps a label to the list of column names to evaluate.
    cv : int or cross-validation splitter, default 5
        Forwarded to ``cross_validate``. The default reproduces the previous behavior.
        NOTE: per scikit-learn's documentation an integer ``cv`` uses consecutive,
        unshuffled folds for regressors, so the row order of ``X`` influences the
        scores; pass e.g. a shuffled ``KFold`` to change that.

    Returns
    -------
    dict[str, dict[str, float]]
        For each label: 'mean_r2', 'std_r2', 'mean_mse', 'std_mse' across folds.
    """
    model = LinearRegression()
    scoring = ['r2', 'neg_mean_squared_error']
    results = {}

    for name, features in feature_sets.items():
        cv_out = cross_validate(model, X[features], y, cv=cv, scoring=scoring)
        r2_scores = cv_out['test_r2']
        # The scorer reports negated MSE (higher is better); flip the sign back.
        mse_scores = -cv_out['test_neg_mean_squared_error']
        results[name] = {
            'mean_r2': np.mean(r2_scores),
            'std_r2': np.std(r2_scores),
            'mean_mse': np.mean(mse_scores),
            'std_mse': np.std(mse_scores)
        }
    return results

In [31]:
# Run the cross-validated comparison, then report mean (± std) per feature set
results = evaluate_features(X, y, feature_sets)
for name in results:
    metrics = results[name]
    print(f"\n{name}:")
    print(f"Mean R²: {metrics['mean_r2']:.3f} (±{metrics['std_r2']:.3f})")
    print(f"Mean MSE: {metrics['mean_mse']:.3f} (±{metrics['std_mse']:.3f})")


original:
Mean R²: 0.325 (±0.411)
Mean MSE: 37.889 (±22.987)

original_plus_ratios:
Mean R²: 0.430 (±0.307)
Mean MSE: 32.157 (±18.273)

original_plus_interactions:
Mean R²: 0.463 (±0.264)
Mean MSE: 32.542 (±20.968)

original_plus_polynomial:
Mean R²: 0.635 (±0.191)
Mean MSE: 22.227 (±17.224)

original_plus_log:
Mean R²: 0.459 (±0.263)
Mean MSE: 31.367 (±19.987)

all_features:
Mean R²: 0.596 (±0.228)
Mean MSE: 23.620 (±16.041)


In [33]:
# Convert my_notebook.ipynb to a script with the nbconvert CLI ("!" runs the line in the shell)
!jupyter nbconvert --to script my_notebook.ipynb


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/bin/jupyter-nbconvert", line 7, in <module>
    from nbconvert.nbconvertapp import main
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/nbconvert/nbconvertapp.py", line 187, in <module>
    class NbConvertApp(JupyterApp):
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/nbconvert/nbconvertapp.py", line 246, in NbConvertApp
    Options include {get_export_names()}.
                     ^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/nbconvert/exporters/base.py", line 151, in get_export_names
    e = get_exporter(exporter_name)(config=config)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.12/site-packages/nbconvert/exporters/base.py", line 110, in get_exporter
    exporter = items[0].load()
               ^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.12/importlib/metadata/__init__.py", line 205, in load
    module = imp

In [34]:
pip show notebook


Name: notebook
Version: 7.3.3
Summary: Jupyter Notebook - A web-based notebook environment for interactive computing
Home-page: 
Author: 
Author-email: Jupyter Development Team <jupyter@googlegroups.com>
License: BSD 3-Clause License

- Copyright (c) 2001-2015, IPython Development Team
- Copyright (c) 2015-, Jupyter Development Team

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its
   contributors may be used to endorse or promote products derived from
   this software

In [35]:
pip install --upgrade notebook-shim nbconvert


Collecting notebook-shim
  Downloading notebook_shim-0.2.4-py3-none-any.whl.metadata (4.0 kB)
Collecting nbconvert
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Downloading notebook_shim-0.2.4-py3-none-any.whl (13 kB)
Using cached nbconvert-7.16.6-py3-none-any.whl (258 kB)
Installing collected packages: nbconvert, notebook-shim
  Attempting uninstall: nbconvert
    Found existing installation: nbconvert 7.10.0
    Uninstalling nbconvert-7.10.0:
      Successfully uninstalled nbconvert-7.10.0
  Attempting uninstall: notebook-shim
    Found existing installation: notebook_shim 0.2.3
    Uninstalling notebook_shim-0.2.3:
      Successfully uninstalled notebook_shim-0.2.3
Successfully installed nbconvert-7.16.6 notebook-shim-0.2.4
Note: you may need to restart the kernel to use updated packages.


In [38]:
pip install --upgrade nbconvert


Note: you may need to restart the kernel to use updated packages.


In [39]:
jupyter nbconvert --to script my_notebook.ipynb


SyntaxError: invalid syntax (426049048.py, line 1)