In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


### Identify Missing Values

In [2]:
data = pd.read_csv('../data/bostonhousing.csv')

# Check for missing values
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Check if there are any missing values in the entire dataset
if missing_values.sum() == 0:
    print("There are no missing values in the dataset.")
else:
    print("There are missing values in the dataset.")

Missing values in each column:
 crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64
There are no missing values in the dataset.


### Handle Missing Values
Since the Boston Housing Dataset typically does not have missing values, we'll assume there are no missing values to handle. However, if there were missing values, you could handle them using methods like imputation.

In [3]:
# Check for missing values
if missing_values.sum() > 0:
    
    # Example: Fill missing values with the mean of the column
    data.fillna(data.mean(), inplace=True)
    print("Missing values have been filled with the mean of each column.")
else:
    print("No missing values to handle.")

No missing values to handle.


### Identify Outliers
Let's identify outliers using the Interquartile Range (IQR) method.

In [7]:
# List of features to check for outliers
features = ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'medv']

# Function to identify outliers using IQR
def identify_outliers(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data[column] < lower_bound) | (data[column] > upper_bound)]
    return outliers

# Identify outliers for each feature
outliers_info = {}
for feature in features:
    outliers = identify_outliers(data, feature)
    outliers_info[feature] = outliers
    print(f"Number of outliers in {feature}: {len(outliers)}")

Number of outliers in crim: 66
Number of outliers in zn: 68
Number of outliers in indus: 0
Number of outliers in chas: 35
Number of outliers in nox: 0
Number of outliers in rm: 30
Number of outliers in age: 0
Number of outliers in dis: 5
Number of outliers in rad: 0
Number of outliers in tax: 0
Number of outliers in ptratio: 15
Number of outliers in b: 77
Number of outliers in lstat: 7
Number of outliers in medv: 40


### Handle Outliers
To handle outliers, we can choose to remove them or cap them. Here, we'll remove outliers based on the IQR method.

In [8]:
# Function to remove outliers using IQR
def remove_outliers(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    filtered_data = data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]
    return filtered_data

# Remove outliers for each feature
for feature in features:
    data = remove_outliers(data, feature)

print("Outliers have been removed from the dataset.")

Outliers have been removed from the dataset.


### Using Pandas for One-Hot Encoding

In [None]:
# Adding a new categorical variable 'neighborhood'
data['neighborhood'] = ['A' if i < 250 else 'B' for i in range(len(data))]

# One-Hot Encoding using pandas
data_encoded = pd.get_dummies(data, columns=['neighborhood'], drop_first=True)

print(data_encoded.head())

      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
5  0.02985   0.0   2.18     0  0.458  6.430  58.7  6.0622    3  222     18.7   
6  0.08829  12.5   7.87     0  0.524  6.012  66.6  5.5605    5  311     15.2   
7  0.14455  12.5   7.87     0  0.524  6.172  96.1  5.9505    5  311     15.2   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
5  394.12   5.21  28.7  
6  395.60  12.43  22.9  
7  396.90  19.15  27.1  


### Using Scikit-Learn for Label Encoding

In [5]:
# Adding a new categorical variable 'neighborhood'
data['neighborhood'] = ['A' if i < 250 else 'B' for i in range(len(data))]

# Label Encoding using scikit-learn
label_encoder = LabelEncoder()
data['neighborhood_encoded'] = label_encoder.fit_transform(data['neighborhood'])

print(data.head())


       crim    zn     indus  chas       nox        rm       age       dis  \
0  0.000000  0.18  0.067815     0  0.314815  0.577505  0.641607  0.269203   
1  0.000236  0.00  0.242302     0  0.172840  0.547998  0.782698  0.348962   
2  0.000236  0.00  0.242302     0  0.172840  0.694386  0.599382  0.348962   
3  0.000293  0.00  0.063050     0  0.150206  0.658555  0.441813  0.448545   
4  0.000705  0.00  0.063050     0  0.150206  0.687105  0.528321  0.448545   

        rad       tax   ptratio         b     lstat  medv neighborhood  \
0  0.000000  0.208015  0.287234  1.000000  0.089680  24.0            A   
1  0.043478  0.104962  0.553191  1.000000  0.204470  21.6            A   
2  0.043478  0.104962  0.553191  0.989737  0.063466  34.7            A   
3  0.086957  0.066794  0.648936  0.994276  0.033389  33.4            A   
4  0.086957  0.066794  0.648936  1.000000  0.099338  36.2            A   

   neighborhood_encoded  
0                     0  
1                     0  
2             

### Normalize Numerical Features

In [4]:
# List of numerical features
numerical_features = ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']

# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Normalize the numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])

print("Normalized Data:")
print(data[numerical_features].head())

Normalized Data:
       crim    zn     indus       nox        rm       age       dis       rad  \
0  0.000000  0.18  0.067815  0.314815  0.577505  0.641607  0.269203  0.000000   
1  0.000236  0.00  0.242302  0.172840  0.547998  0.782698  0.348962  0.043478   
2  0.000236  0.00  0.242302  0.172840  0.694386  0.599382  0.348962  0.043478   
3  0.000293  0.00  0.063050  0.150206  0.658555  0.441813  0.448545  0.086957   
4  0.000705  0.00  0.063050  0.150206  0.687105  0.528321  0.448545  0.086957   

        tax   ptratio         b     lstat  
0  0.208015  0.287234  1.000000  0.089680  
1  0.104962  0.553191  1.000000  0.204470  
2  0.104962  0.553191  0.989737  0.063466  
3  0.066794  0.648936  0.994276  0.033389  
4  0.066794  0.648936  1.000000  0.099338  


### Standardize Numerical Features

In [6]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Standardize the numerical features
data[numerical_features] = scaler.fit_transform(data[numerical_features])

print("Standardized Data:")
print(data[numerical_features].head())

Standardized Data:
       crim        zn     indus       nox        rm       age       dis  \
0 -0.419782  0.284830 -1.287909 -0.144217  0.413672 -0.120013  0.140214   
1 -0.417339 -0.487722 -0.593381 -0.740262  0.194274  0.367166  0.557160   
2 -0.417342 -0.487722 -0.593381 -0.740262  1.282714 -0.265812  0.557160   
3 -0.416750 -0.487722 -1.306878 -0.835284  1.016303 -0.809889  1.077737   
4 -0.412482 -0.487722 -1.306878 -0.835284  1.228577 -0.511180  1.077737   

        rad       tax   ptratio         b     lstat  
0 -0.982843 -0.666608 -1.459000  0.441052 -1.075562  
1 -0.867883 -0.987329 -0.303094  0.441052 -0.492439  
2 -0.867883 -0.987329 -0.303094  0.396427 -1.208727  
3 -0.752922 -1.106115  0.113032  0.416163 -1.361517  
4 -0.752922 -1.106115  0.113032  0.441052 -1.026501  


### Split the Data into Training and Testing Sets

In [22]:
# List of numerical features
numerical_features = ['crim', 'zn', 'indus', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']

# Target variable
target = 'medv'

# Split the data into features (X) and target (y)
X = data[numerical_features]
y = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shapes:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)

print("\nTesting set shapes:")
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

Training set shapes:
X_train: (404, 12)
y_train: (404,)

Testing set shapes:
X_test: (102, 12)
y_test: (102,)
