In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import re
import scipy.stats as stats

In [19]:
import gdown
from pathlib import Path

# Public Google Drive *folder URL* that holds the dataset (the name `ID` is
# kept for compatibility, but the value is a URL, so it is passed as `url=`).
ID = "https://drive.google.com/drive/folders/1RZJdNpCfbMt2SB_kfWsxQgOk5LeuV7lw"
DATA_PATH = Path('Divar Dataset') / 'Divar.csv'

# Only hit the network when the CSV is not on disk yet, so that
# "Restart Kernel -> Run All" does not re-download the folder every time.
if not DATA_PATH.exists():
    gdown.download_folder(url=ID, quiet=True)
df = pd.read_csv(DATA_PATH, low_memory=False)

# Display basic info
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
print(df.head())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 61 columns):
 #   Column                      Non-Null Count    Dtype  
---  ------                      --------------    -----  
 0   Unnamed: 0                  1000000 non-null  int64  
 1   cat2_slug                   1000000 non-null  object 
 2   cat3_slug                   999999 non-null   object 
 3   city_slug                   999998 non-null   object 
 4   neighborhood_slug           437139 non-null   object 
 5   created_at_month            1000000 non-null  object 
 6   user_type                   288882 non-null   object 
 7   description                 1000000 non-null  object 
 8   title                       999946 non-null   object 
 9   rent_mode                   352994 non-null   object 
 10  rent_value                  351322 non-null   float64
 11  rent_to_single              19 non-null       object 
 12  rent_type                   103961 non-null   object 
 13

In [21]:
# Build the modelling frame `df_clean` from the raw frame `df` without
# modifying `df`.
#
# Columns removed from the feature set: free text (description, title) and the
# rent / price / credit fields listed below.
# NOTE(review): 'Unnamed: 0' is still kept as a feature; it looks like a saved
# row index from the CSV export - confirm whether it should be dropped too.
columns_to_drop = ['description', 'title', 'rent_mode', 'rent_value', 'rent_to_single',
                   'rent_type', 'price_mode', 'price_value', 'credit_mode', 'credit_value',
                   'rent_credit_transform', 'transformed_credit', 'transformed_rent']

# Keep only rows where the target is present. Filtering first means the column
# drop and the copy below operate on the reduced frame instead of deep-copying
# the whole raw frame up front.
has_target = df['transformable_price'].notna()

# The explicit .copy() makes df_clean an independent frame, so the in-place
# column assignments done later in this cell can never touch `df`.
df_clean = df.loc[has_target].drop(columns=columns_to_drop).copy()

# Convert Farsi numbers to English
# Translation table built once at definition time: maps the code points of the
# Persian digits (U+06F0..U+06F9) and of the Arabic-Indic digits
# (U+0660..U+0669) to the ASCII digits '0'..'9'.
_DIGIT_TABLE = {
    zero_code_point + value: str(value)
    for zero_code_point in (0x06F0, 0x0660)
    for value in range(10)
}


def farsi_to_english(text):
    """Return ``text`` as a string with Persian/Arabic-Indic digits replaced by ASCII digits.

    Parameters
    ----------
    text : scalar
        Value to normalise. Non-string values are converted with ``str()``
        before translation.

    Returns
    -------
    str, or the input unchanged
        Missing values (anything ``pd.isna`` reports as missing, e.g. ``NaN``
        or ``None``) are returned as-is so they can still be imputed later;
        every other value comes back as a ``str``.
    """
    if pd.isna(text):
        return text
    # A single translate() pass replaces the ten sequential str.replace() calls.
    return str(text).translate(_DIGIT_TABLE)

# Normalise digits in every object-dtype column (numeric columns are skipped).
# farsi_to_english turns each non-missing value into a str.
# NOTE(review): the imputation statistics and label encoders below are fit on
# the whole frame before any train/validation/test split is made - confirm
# this leakage is acceptable for the experiment.
object_columns = [col for col in df_clean.columns if df_clean[col].dtype == 'object']
for col in object_columns:
    df_clean[col] = df_clean[col].apply(farsi_to_english)

# Numeric columns: replace missing entries with that column's median
num_cols = df_clean.select_dtypes(include=np.number).columns
column_medians = df_clean[num_cols].median()
for col, median_value in column_medians.items():
    df_clean[col] = df_clean[col].fillna(median_value)

# Object columns: replace missing entries with the most frequent value,
# falling back to the placeholder 'Unknown' when the column has no mode
cat_cols = df_clean.select_dtypes(include='object').columns
for col in cat_cols:
    mode_val = df_clean[col].mode()
    fill_value = 'Unknown' if mode_val.empty else mode_val.iloc[0]
    df_clean[col] = df_clean[col].fillna(fill_value)

# Integer-encode every object column; the fitted encoder of each column is
# kept in `label_encoders` under the column name
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    as_text = df_clean[col].astype(str)
    df_clean[col] = le.fit_transform(as_text)
    label_encoders[col] = le

In [25]:
# Predictors are every column except the target; the target is 'transformable_price'
X = df_clean.drop(columns=['transformable_price'])
y = df_clean['transformable_price']

# Step 1: hold out 20% of all rows as the test set
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: take 25% of the remaining rows as the validation set
# (0.25 * 0.8 = 0.2 of the full data, i.e. a 60/20/20 split overall)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    random_state=42,
)

# Report the shape of each partition
for split_name, split_frame in (("Training", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} set size: {split_frame.shape}")

Training set size: (211736, 47)
Validation set size: (70579, 47)
Test set size: (70579, 47)


In [37]:
# Search space for the random forest: scipy distributions are sampled,
# plain lists are drawn from uniformly
param_dist = dict(
    n_estimators=stats.randint(50, 300),
    max_depth=[None, *range(5, 50, 5)],
    min_samples_split=stats.randint(2, 20),
    min_samples_leaf=stats.randint(1, 10),
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# Base estimator whose hyperparameters are tuned
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Randomized search: 4 sampled settings x 3 CV folds, ranked by R^2.
# NOTE(review): both the forest and the search request n_jobs=-1 - check for
# CPU oversubscription on the target machine.
search_options = {
    'n_iter': 4,
    'cv': 3,
    'scoring': 'r2',
    'n_jobs': -1,
    'random_state': 42,
    'verbose': 1,
}
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    **search_options,
)

# Run the search on the training partition only
print("Starting hyperparameter tuning with RandomizedSearchCV...")
random_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score
best_params = random_search.best_params_
print(f"Best parameters: {best_params}")
print(f"Best cross-validation score: {random_search.best_score_:.4f}")

# best_estimator_ is the model the search has already refit with the best
# parameters (RandomizedSearchCV's default refit behaviour); no extra fit here
best_model = random_search.best_estimator_

Starting hyperparameter tuning with RandomizedSearchCV...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters: {'bootstrap': False, 'max_depth': 25, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 253}
Best cross-validation score: 0.1183


In [38]:
# Score the tuned model on the held-out validation partition
y_val_pred = best_model.predict(X_val)

# Regression metrics; RMSE is the square root of the MSE
r2_val = r2_score(y_val, y_val_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)
mse_val = mean_squared_error(y_val, y_val_pred)
rmse_val = np.sqrt(mse_val)

# Print every metric with four decimals
print("\nValidation Set Evaluation Metrics:")
validation_report = (
    ('R2 Score', r2_val),
    ('MAE', mae_val),
    ('MSE', mse_val),
    ('RMSE', rmse_val),
)
for metric_label, metric_value in validation_report:
    print(f'{metric_label}: {metric_value:.4f}')


Validation Set Evaluation Metrics:
R2 Score: 0.1215
MAE: 0.2916
MSE: 0.1441
RMSE: 0.3796


In [39]:
# Fit a fresh forest with the tuned hyperparameters on train + validation rows
print("Retraining on combined training+validation data...")
final_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
final_model.fit(X_train_val, y_train_val)

# Score once on the untouched test partition
y_test_pred = final_model.predict(X_test)

# Regression metrics; RMSE is the square root of the MSE
r2_test = r2_score(y_test, y_test_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)

# Print every metric with four decimals
print("\nFinal Test Set Evaluation Metrics:")
test_report = (
    ('R2 Score', r2_test),
    ('MAE', mae_test),
    ('MSE', mse_test),
    ('RMSE', rmse_test),
)
for metric_label, metric_value in test_report:
    print(f'{metric_label}: {metric_value:.4f}')

Retraining on combined training+validation data...

Final Test Set Evaluation Metrics:
R2 Score: 0.1254
MAE: 0.2908
MSE: 0.1436
RMSE: 0.3789
