## Step 1: Import Libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import SelectKBest, f_regression

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")


## Step 2: Load and Explore the Dataset

In [2]:
# Load the Boston Housing dataset.
# The path is relative to the notebook's working directory instead of an
# absolute, user-specific location, so the notebook can run on any machine.
# Adjust DATA_PATH if the CSV lives somewhere else.
DATA_PATH = "BostonHousing.csv"
data_frame = pd.read_csv(DATA_PATH)

# Preview the first five rows
data_frame.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


## Step 3: Handle Missing Values

In [3]:
# Count the missing entries in each column (isna is an alias of isnull)
missing_per_column = data_frame.isna().sum()
print(missing_per_column)

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


## Step 4: Encoding Categorical Variables

In [4]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
# The dtypes help decide which columns need encoding.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
# Preview the first 15 rows to eyeball the column values before encoding
data_frame.head(15)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.1,18.9


In [6]:
# One-hot encode `rad`: every distinct value becomes its own 0/1 integer
# column named rad_<value>, and the original `rad` column is dropped.
df = pd.get_dummies(
    data_frame,
    columns=['rad'],
    prefix='rad',
    dtype=int,
)

# Preview the encoded frame
df.head(10)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,tax,ptratio,...,medv,rad_1,rad_2,rad_3,rad_4,rad_5,rad_6,rad_7,rad_8,rad_24
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,296,15.3,...,24.0,1,0,0,0,0,0,0,0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,242,17.8,...,21.6,0,1,0,0,0,0,0,0,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,242,17.8,...,34.7,0,1,0,0,0,0,0,0,0
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,222,18.7,...,33.4,0,0,1,0,0,0,0,0,0
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,222,18.7,...,36.2,0,0,1,0,0,0,0,0,0
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,222,18.7,...,28.7,0,0,1,0,0,0,0,0,0
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,311,15.2,...,22.9,0,0,0,0,1,0,0,0,0
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,311,15.2,...,27.1,0,0,0,0,1,0,0,0,0
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,311,15.2,...,16.5,0,0,0,0,1,0,0,0,0
9,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,311,15.2,...,18.9,0,0,0,0,1,0,0,0,0


## Step 6: Normalize/ Standardize Numerical Features

In [7]:
# Separate the predictors (X) from the target (y = `medv`).
X = df.drop(columns=['medv'])
y = df['medv']

# Standardize every predictor column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed later in this notebook, so test-set statistics leak into the
# training features. Consider splitting first and fitting on the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# fit_transform returns a bare NumPy array; rebuild the DataFrame with the
# original column names AND the original index so rows stay aligned with y
# even if the index is not a default 0..n-1 range.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)


In [8]:
# Number of top-scoring features to keep. Defined once so the selector and the
# printed ranking cannot drift apart.
N_TOP_FEATURES = 10

# Score each feature against the target with f_regression and configure the
# selector to keep the N_TOP_FEATURES highest-scoring ones.
bestfeatures = SelectKBest(score_func=f_regression, k=N_TOP_FEATURES)

# Fit the selector to compute one score per column of X
fit = bestfeatures.fit(X, y)

# Pair each feature name with its score in a single table
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})

# Print the N_TOP_FEATURES best features, highest score first
print(featureScores.nlargest(N_TOP_FEATURES, 'Score'))

      Specs       Score
11    lstat  601.617871
5        rm  471.846740
9   ptratio  175.105543
2     indus  153.954883
8       tax  141.761357
4       nox  112.591480
20   rad_24   93.901047
0      crim   89.486115
6       age   83.477459
1        zn   75.257642


## Step 7: Split Data into Training & Testing Sets

In [9]:
# 1. Take the top-k feature names straight from the score table (highest score
#    first) instead of hand-copying them from printed output, so the list
#    cannot go stale if the data or k changes.
selected_features = featureScores.nlargest(bestfeatures.k, 'Score')['Specs'].tolist()

# 2. Filter DataFrame to keep only selected features
X_selected = X[selected_features]

In [10]:
# Hold out 30% of the rows for testing (70% train / 30% test).
# The fixed random_state makes the split reproducible across runs.
TEST_SIZE = 0.3
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Print shapes
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (354, 10), Test shape: (152, 10)


### Save Dataframe into CSV file

In [11]:
# Write each split to its own CSV file in the working directory,
# leaving the row index out of the files.
splits_to_save = [
    (X_train, 'X_train.csv'),
    (X_test, 'X_test.csv'),
    (y_train, 'y_train.csv'),
    (y_test, 'y_test.csv'),
]
for split, filename in splits_to_save:
    split.to_csv(filename, index=False)