In [2]:
import seaborn as sns
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split,GridSearchCV,KFold
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import StandardScaler    
from xgboost import XGBRegressor
import time


In [9]:
df = pd.read_csv('C:\\Users\\Pedram\\OneDrive\\py\\webapp\\data\\housePrice.csv')

In [4]:
df["Area"] = df["Area"].apply(lambda x : re.sub(",","",x))
df["Area"] = pd.to_numeric(df["Area"],errors="coerce")

In [5]:
df.dropna(inplace=True)


In [6]:
df = df.drop("Price(USD)",axis=1)


In [7]:
df[["Parking","Warehouse","Elevator"]] = df[["Parking","Warehouse","Elevator"]].astype("int64")


In [8]:
def IQR(data,treshold=1.5):
    q1 = np.percentile(data,25)
    q3 = np.percentile(data,75)

    iqr = q3 - q1

    lower = q1 - treshold * iqr
    upper = q3 + treshold * iqr

    return lower , upper

In [9]:
lower_price, upper_price = IQR(df["Price"])
lower_area, upper_area = IQR(df["Area"])

# beacuse we dont have negative values for this columns
# therefor we did not calculate the lower limit
print(f"Upper limit for Price:{upper_price}")
print(f"Upper limit for Area:{upper_area}")

Upper limit for Price:12870000000.0
Upper limit for Area:199.0


In [10]:
df = df.query(f"(Area < {upper_area}) & (Price < {upper_price})")
df.head()

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Address,Price
0,63,1,1,1,1,Shahran,1850000000.0
1,60,1,1,1,1,Shahran,1850000000.0
2,79,2,1,1,1,Pardis,550000000.0
3,95,2,1,1,1,Shahrake Qods,902500000.0
4,123,2,1,1,1,Shahrake Gharb,7000000000.0


In [11]:
address_dummy = pd.get_dummies(df['Address'])
df_final = df.merge(address_dummy, left_index = True, right_index = True)
df_final.drop(columns = 'Address', inplace = True)
df_final.head(3)

Unnamed: 0,Area,Room,Parking,Warehouse,Elevator,Price,Abazar,Abbasabad,Abuzar,Afsarieh,...,Waterfall,West Ferdows Boulevard,West Pars,Yaftabad,Yakhchiabad,Yousef Abad,Zafar,Zaferanieh,Zargandeh,Zibadasht
0,63,1,1,1,1,1850000000.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,60,1,1,1,1,1850000000.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,79,2,1,1,1,550000000.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
X = df_final.drop("Price",axis="columns")
y = df_final["Price"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [14]:
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

In [15]:
#def parameter_finder (model, parameters):
    
    start = time.time()
    
    grid = GridSearchCV(model, 
                        param_grid = parameters, 
                        refit = True, 
                        cv = KFold(shuffle = True, random_state = 1), 
                        n_jobs = -1)
    grid_fit = grid.fit(X_train_s, y_train)
    best = grid_fit.best_estimator_
    y_pred = best.predict(X_test_s)
    
    train_score =best.score(X_train_s, y_train)
    test_score = best.score(X_test_s, y_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    
    model_name = str(model).split('(')[0]
    
    end = time.time()
    
    print(f"The best parameters for {model_name} model is: {grid_fit.best_params_}")
    print("--" * 10)
    print(f"(R2 score) in the training set is {train_score:0.2%} for {model_name} model.")
    print(f"(R2 score) in the testing set is {test_score:0.2%} for {model_name} model.")
    print(f"RMSE is {RMSE:,} for {model_name} model.")
    print("--" * 10)
    print(f"Runtime of the program is: {end - start:0.2f}")
    
       
    return train_score, test_score, RMSE, y_pred

IndentationError: unexpected indent (2469842334.py, line 3)

In [None]:
xgb_regressor = XGBRegressor()

param_grid = {
    'n_estimators': [50, 100, 200,250,300],
    'learning_rate': [0.01, 0.1, 0.5,0.2],
    'max_depth': [3, 5, 7,9]
}

xgbr_train_score, xgbr_test_score, xgbr_RMSE, xgbr_pred = parameter_finder(xgb_regressor, param_grid)

The best parameters for XGBRegressor model is: {'learning_rate': 0.5, 'max_depth': 3, 'n_estimators': 200}
--------------------
(R2 score) in the training set is 94.20% for XGBRegressor model.
(R2 score) in the testing set is 86.36% for XGBRegressor model.
RMSE is 1,074,807,672.4527442 for XGBRegressor model.
--------------------
Runtime of the program is: 31.29


In [None]:
final_xgboost = XGBRegressor(learning_rate=0.5,
                            max_depth=3,
                            n_estimators=200
                            ,random_state = 1)

In [None]:
final_xgboost.fit(X_train_s, y_train)


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [None]:
y_predfinal = final_xgboost.predict(X_test_s)


In [None]:
from sklearn.metrics import r2_score, mean_squared_error

r2 = r2_score(y_test, y_predfinal)
rmse = np.sqrt(mean_squared_error(y_test, y_predfinal))

print("R2 Score:", r2)
print("RMSE:", rmse)

R2 Score: 0.8636014774330517
RMSE: 1074807672.4527442


In [None]:
import joblib
joblib.dump(final_xgboost, "final_xgboost_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("Model and scaler saved!")

Model and scaler saved!


In [None]:
columns = df_final.columns
joblib.dump(columns, "model_columns.pkl")
print("Model columns saved!")

Model columns saved!


In [None]:
columns

Index(['Area', 'Room', 'Parking', 'Warehouse', 'Elevator', 'Price', 'Abazar',
       'Abbasabad', 'Abuzar', 'Afsarieh',
       ...
       'Waterfall', 'West Ferdows Boulevard', 'West Pars', 'Yaftabad',
       'Yakhchiabad', 'Yousef Abad', 'Zafar', 'Zaferanieh', 'Zargandeh',
       'Zibadasht'],
      dtype='object', length=193)

In [None]:
df_copy = df.copy() 
df_copy.to_csv("cleaned_dataset.csv", index=False)