In [1]:
import pandas as pd
import matplotlib as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [2]:
# Load the datasets
data1 = pd.read_csv('aqardata_2.csv')
data2 = pd.read_csv('DocRealestateSale.csv')
data3 = pd.reas_csv("DocRealestateSale2023Q3.csv")
data4 = pd.read_csv("Transactionssaleforrealestate.csv")

In [3]:
#Showing 
print(data.shape)
data.head()

(2951, 8)


Unnamed: 0,mainlocation,sublocation,neighborhood,frontage,purpose,streetwidth,size,Pricepm
0,الرياض,غرب الرياض,حي ظهرة لبن,شمال,سكني,20.0,727,1800
1,بريدة,,حي مشعل,غرب,,15.0,450,950
2,الخبر,,حي الحمرا,غرب,تجاري,100.0,1450,3500
3,الخبر,,حي الحزام الاخضر,شرق,,15.0,440,2700
4,بريدة,,حي الرحاب,جنوب غربي,,40.0,784,950


In [4]:
# Separate features and target variable
X = data.drop('Pricepm', axis=1) # All columns except 'Pricepm'
y = data['Pricepm']     # The target column 'Pricepm'

In [5]:
# Display the first few rows of X and y to be sure about our work
print(X.head())

  mainlocation sublocation      neighborhood   frontage purpose  streetwidth  \
0       الرياض  غرب الرياض       حي ظهرة لبن       شمال    سكني         20.0   
1        بريدة         NaN           حي مشعل        غرب     NaN         15.0   
2        الخبر         NaN         حي الحمرا        غرب   تجاري        100.0   
3        الخبر         NaN  حي الحزام الاخضر        شرق     NaN         15.0   
4        بريدة         NaN         حي الرحاب  جنوب غربي     NaN         40.0   

   size  
0   727  
1   450  
2  1450  
3   440  
4   784  


In [6]:
print(y.head())

0    1800
1     950
2    3500
3    2700
4     950
Name: Pricepm, dtype: int64


In [7]:
# Encode categorical variables with LabelEncoder
encoder = LabelEncoder()
X_encoded = X.apply(encoder.fit_transform)

In [8]:
# Initialize the KFold object with shuffle and random_state
kf = KFold(n_splits=5, shuffle=True, random_state=24)

In [9]:
# Initialize a list to store the mean squared errors, mean absolute errors, RMSE, and R2 scores for each model
evaluation_scores = []

In [10]:
# Define the models to evaluate
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(random_state=35)),
    ('Gradient Boosting', GradientBoostingRegressor(random_state=35)),
    ('Support Vector Regression', SVR())
]

In [11]:
# Iterate over the models and perform cross-validation
for model_name, model in models:
    # Initialize lists to store the evaluation scores for each fold
    fold_mse_scores = []
    fold_mae_scores = []
    fold_rmse_scores = []
    fold_r2_scores = []


In [12]:
# Perform cross-validation
for train_index, test_index in kf.split(X_encoded):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X_encoded.iloc[train_index], X_encoded.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [13]:
 # Fit the model on the training data
model.fit(X_train, y_train)

In [14]:
# Make predictions on the testing data
y_pred = model.predict(X_test)

In [15]:
# Calculate the evaluation scores for this fold
fold_mse = mean_squared_error(y_test, y_pred)
fold_mae = mean_absolute_error(y_test, y_pred)
fold_rmse = root_mean_squared_error(y_test, y_pred)  # Use the new function
fold_r2 = r2_score(y_test, y_pred)


In [16]:
# Append the evaluation scores to the respective lists
fold_mse_scores.append(fold_mse)
fold_mae_scores.append(fold_mae)
fold_rmse_scores.append(fold_rmse)
fold_r2_scores.append(fold_r2)


In [17]:
# Calculate the mean of the evaluation scores for this model
avg_mse = sum(fold_mse_scores) / len(fold_mse_scores)
avg_mae = sum(fold_mae_scores) / len(fold_mae_scores)
avg_rmse = sum(fold_rmse_scores) / len(fold_rmse_scores)
avg_r2 = sum(fold_r2_scores) / len(fold_r2_scores)

In [18]:
# Append the average evaluation scores to the list
evaluation_scores.append((model_name, avg_mse, avg_mae, avg_rmse, avg_r2))

In [19]:
# Sort the models based on their average mean squared error in ascending order
evaluation_scores.sort(key=lambda x: x[1])

In [20]:
# Get the best performing model and its corresponding evaluation scores
best_model, best_mse, best_mae, best_rmse, best_r2 = evaluation_scores[0]

In [21]:
# Print the best performing model and its evaluation scores
print(f"Best Model: {best_model}")
print(f"Mean Squared Error: {best_mse}")
print(f"Mean Absolute Error: {best_mae}")
print(f"Root Mean Squared Error: {best_rmse}")
print(f"R2 Score: {best_r2}")

Best Model: Support Vector Regression
Mean Squared Error: 2281199.511433343
Mean Absolute Error: 1244.920315204445
Root Mean Squared Error: 1510.364032752814
R2 Score: 0.011479267964290663
