In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error

In [11]:
# Load your dataset
file_path = 'house_data.csv'  # Replace with the path to your dataset
column_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'price']

# Read CSV with space delimiter and assign column names
house_price_dataframe = pd.read_csv(file_path, delimiter=r'\s+', header=None, names=column_names)

In [12]:
#check the first five rows of the data
house_price_dataframe.head

<bound method NDFrame.head of         CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786    1  273.0   
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875    1  273.0   
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675    1  273.0   
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889    1  273.0   
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050    1  273.0   

     PTRATIO       B  LSTAT  price  
0       15.3  396.90   4.98   24

In [13]:
house_price_dataframe.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,price
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [14]:
house_price_dataframe.shape

(506, 14)

In [15]:
print(house_price_dataframe.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     506 non-null    float64
 1   ZN       506 non-null    float64
 2   INDUS    506 non-null    float64
 3   CHAS     506 non-null    int64  
 4   NOX      506 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      506 non-null    float64
 7   DIS      506 non-null    float64
 8   RAD      506 non-null    int64  
 9   TAX      506 non-null    float64
 10  PTRATIO  506 non-null    float64
 11  B        506 non-null    float64
 12  LSTAT    506 non-null    float64
 13  price    506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB
None


In [17]:
print(house_price_dataframe.isnull().sum())

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
price      0
dtype: int64


In [18]:
# Split features and target
X = house_price_dataframe.drop(['price'], axis=1)
Y = house_price_dataframe['price']


In [19]:
# Split the data into 60% train, 20% validation, and 20% test sets
# First split into 80% train and 20% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

# Then split train set into 75% train and 25% validation
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size=0.25, random_state=2)

In [21]:
from xgboost import XGBRegressor


In [22]:
model = XGBRegressor()

In [23]:
# Train the model with X_train
model.fit(X_train, Y_train)

In [24]:
# Accuracy for prediction on training data
training_data_prediction = model.predict(X_train)

In [26]:
from sklearn import metrics
print("Training Data:")
score_1 = metrics.r2_score(Y_train, training_data_prediction)
score_2 = metrics.mean_absolute_error(Y_train, training_data_prediction)
print("R squared error : ", score_1)
print('Mean Absolute Error : ', score_2)

Training Data:
R squared error :  0.9999998075678893
Mean Absolute Error :  0.0028388183895904184


In [27]:
# Accuracy for prediction on validation data
validation_data_prediction = model.predict(X_valid)

print("Validation Data:")
score_1 = metrics.r2_score(Y_valid, validation_data_prediction)
score_2 = metrics.mean_absolute_error(Y_valid, validation_data_prediction)
print("R squared error : ", score_1)
print('Mean Absolute Error : ', score_2)

Validation Data:
R squared error :  0.8677337099704384
Mean Absolute Error :  2.3518617856620563


In [29]:
# Accuracy for prediction on test data
test_data_prediction = model.predict(X_test)

print("Test Data:")
score_1 = metrics.r2_score(Y_test, test_data_prediction)
score_2 = metrics.mean_absolute_error(Y_test, test_data_prediction)
print("R squared error : ", score_1)
print('Mean Absolute Error : ', score_2)

Test Data:
R squared error :  0.8933179051608079
Mean Absolute Error :  2.1698381760541126


In [30]:
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# SVR model
svr_model = make_pipeline(StandardScaler(), SVR())
svr_model.fit(X_train, Y_train)

# Prediction on validation set
svr_predictions = svr_model.predict(X_valid)

# Evaluate SVR model
svr_r2 = metrics.r2_score(Y_valid, svr_predictions)
svr_mae = metrics.mean_absolute_error(Y_valid, svr_predictions)

print("SVR Model:")
print(f"R squared error : {svr_r2}")
print(f"Mean Absolute Error : {svr_mae}")


SVR Model:
R squared error : 0.6156710861611236
Mean Absolute Error : 3.3451992394251593


In [31]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest model
rf_model = RandomForestRegressor(random_state=0)
rf_model.fit(X_train, Y_train)

# Prediction on validation set
rf_predictions = rf_model.predict(X_valid)

# Evaluate Random Forest model
rf_r2 = metrics.r2_score(Y_valid, rf_predictions)
rf_mae = metrics.mean_absolute_error(Y_valid, rf_predictions)

print("\nRandom Forest Model:")
print(f"R squared error : {rf_r2}")
print(f"Mean Absolute Error : {rf_mae}")



Random Forest Model:
R squared error : 0.8810249149195177
Mean Absolute Error : 2.1907425742574262


In [32]:
from sklearn.linear_model import LinearRegression

# Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train, Y_train)

# Prediction on validation set
lr_predictions = lr_model.predict(X_valid)

# Evaluate Linear Regression model
lr_r2 = metrics.r2_score(Y_valid, lr_predictions)
lr_mae = metrics.mean_absolute_error(Y_valid, lr_predictions)

print("\nLinear Regression Model:")
print(f"R squared error : {lr_r2}")
print(f"Mean Absolute Error : {lr_mae}")



Linear Regression Model:
R squared error : 0.6796820858482173
Mean Absolute Error : 3.4464172226660152


In [33]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Create polynomial features
degree = 2  # Adjust the degree as needed
poly_features = PolynomialFeatures(degree=degree)
X_poly_train = poly_features.fit_transform(X_train)
X_poly_valid = poly_features.transform(X_valid)

# Polynomial Regression model
poly_model = LinearRegression()
poly_model.fit(X_poly_train, Y_train)

# Prediction on validation set
poly_predictions = poly_model.predict(X_poly_valid)

# Evaluate Polynomial Regression model
poly_r2 = metrics.r2_score(Y_valid, poly_predictions)
poly_mae = metrics.mean_absolute_error(Y_valid, poly_predictions)

print("\nPolynomial Regression Model (Degree={}):".format(degree))
print(f"R squared error : {poly_r2}")
print(f"Mean Absolute Error : {poly_mae}")



Polynomial Regression Model (Degree=2):
R squared error : 0.8015832253409266
Mean Absolute Error : 2.658022848322146
