In [12]:
pip install xgboost


Note: you may need to restart the kernel to use updated packages.


In [15]:
import pandas as pd

# Load the dataset to examine its structure and content
file_path = 'new_house_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head(), data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 434 entries, 0 to 433
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  434 non-null    int64  
 1   id          434 non-null    int64  
 2   Location    434 non-null    object 
 3   lat-lon     434 non-null    float64
 4   Area        434 non-null    object 
 5   Bed         434 non-null    int64  
 6   Bath        434 non-null    int64  
 7   Price       434 non-null    object 
dtypes: float64(1), int64(4), object(3)
memory usage: 27.2+ KB


(   Unnamed: 0  id                         Location   lat-lon        Area  Bed  \
 0           0   0  Block H, Bashundhara R-A, Dhaka  0.263433  1,600 sqft    3   
 1           1   1         Farmgate, Tejgaon, Dhaka  0.262859    900 sqft    2   
 2           3   3        Gulshan 1, Gulshan, Dhaka  0.263010  2,200 sqft    3   
 3           4   4                 Baridhara, Dhaka  0.263252  2,200 sqft    3   
 4           5   5           Bashundhara R-A, Dhaka  0.263335  3,000 sqft    4   
 
    Bath        Price  
 0     3  23 Thousand  
 1     2  23 Thousand  
 2     4  78 Thousand  
 3     3  77 Thousand  
 4     5  52 Thousand  ,
 None)

In [16]:
# Clean and preprocess the dataset

# Drop unnecessary columns
data = data.drop(columns=["Unnamed: 0", "id"])

# Split 'lat-lon' into latitude and longitude (assuming it is a normalized value for now)
# If lat-lon is already split in the original data, further clarification is needed
data['lat'] = data['lat-lon']
data['lon'] = data['lat-lon']
data = data.drop(columns=['lat-lon'])

# Convert 'Area' to a numerical value
data['Area'] = data['Area'].str.replace(',', '').str.extract(r'(\d+)').astype(float)

# Convert 'Price' to a numerical value (handling "Thousand")
data['Price'] = data['Price'].str.extract(r'(\d+)').astype(float) * 1000

# Verify the cleaned dataset
data.head()


Unnamed: 0,Location,Area,Bed,Bath,Price,lat,lon
0,"Block H, Bashundhara R-A, Dhaka",1600.0,3,3,23000.0,0.263433,0.263433
1,"Farmgate, Tejgaon, Dhaka",900.0,2,2,23000.0,0.262859,0.262859
2,"Gulshan 1, Gulshan, Dhaka",2200.0,3,4,78000.0,0.26301,0.26301
3,"Baridhara, Dhaka",2200.0,3,3,77000.0,0.263252,0.263252
4,"Bashundhara R-A, Dhaka",3000.0,4,5,52000.0,0.263335,0.263335


In [17]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Features and target variable
X = data[['Area', 'Bed', 'Bath', 'lat', 'lon']]
y = data['Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the XGBoost regressor
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/laptopheaven/Downloads/House_rent_estimation-main/.venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <BBC4A126-D15A-3802-AD26-108872BA781A> /Users/laptopheaven/Downloads/House_rent_estimation-main/.venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]


In [8]:
# Retry training with limited boosting rounds to speed up the process
xgb_model = XGBRegressor(n_estimators=100, random_state=42)  # Limit to 100 boosting rounds
xgb_model.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = xgb_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

mae, mse, r2


NameError: name 'XGBRegressor' is not defined