Samuel Bitton : 2246844

Ricardo Caster : 342688405

https://github.com/RicardoCaster/lemidat_mehona.git

In [1]:
import pandas as pd
import numpy as np

In [15]:
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df.head()

Unnamed: 0,property_type,neighborhood,address,room_num,floor,area,garden_area,days_to_enter,num_of_payments,monthly_arnona,...,ac,handicap,has_bars,has_safe_room,has_balcony,is_furnished,is_renovated,num_of_images,distance_from_center,price
0,דירה,הצפון הישן החלק המרכזי,"מהר""ל 25",3.0,2,71,,0.0,12.0,467.0,...,1,0.0,0,1,1,0,0,6.0,1005.0,10150.0
1,דירה,הצפון הישן החלק המרכזי,ארלוזורוב 35,3.0,1,70,,0.0,12.0,240.0,...,1,0.0,1,0,1,0,0,3.0,253.0,6600.0
2,דירה,הצפון הישן החלק המרכזי,וורמיזה 5,2.5,1,65,,,12.0,400.0,...,1,1.0,0,0,1,0,1,8.0,740.0,9000.0
3,דירה,הצפון הישן החלק המרכזי,עמנואל הרומי 30,2.0,3,40,,0.0,12.0,100.0,...,0,0.0,0,0,0,0,0,2.0,1206.0,5800.0
4,דירה,הצפון הישן החלק המרכזי,ארלוזורוב 50,3.0,1,70,,0.0,11.0,250.0,...,1,0.0,1,0,0,0,1,5.0,255.0,7700.0


In [16]:
def prepare_data(df, mode="train"):
    import pandas as pd
    import numpy as np
    
### Clean and preprocess both TRAIN and TEST datasets automatically The function detects whether 'price' exists to decide how to treat the data (point 4 below).
    
    # --- 1. Keep only regular apartments ---
    df = df[df['property_type'] == 'דירה']

    # --- 2. Drop irrelevant columns ---
    columns_to_drop = ['address', 'description', 'num_of_payments', 'num_of_images']
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns], errors='ignore')

    # --- 3. Convert important columns to numeric ---
    numeric_cols = ['room_num', 'area', 'price', 'floor', 'total_floors',
                    'monthly_arnona', 'building_tax', 'days_to_enter', 'distance_from_center']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    # --- 4. Drop rows with unrealistic price only if we are in training set
    if 'price' in df.columns:
        df = df[df['price'].notna()]
        df = df[(df['price'] >= 1000) & (df['price'] <= 50000)]

    # --- 5. Remove unrealistic area values ---
    df = df[df['area'] >= 10]

    # --- 6. Remove rows without neighborhood ---
    df = df[df['neighborhood'].notna()]

    # --- 7. Impute room_num when equal to 0 using similar apartments by area ---
    def impute_room_num(row, reference_df):
        if row['room_num'] == 0:
            similar = reference_df[
                (reference_df['area'] >= row['area'] - 5) &
                (reference_df['area'] <= row['area'] + 5) &
                (reference_df['room_num'] > 0)
            ]
            if not similar.empty:
                return round(similar['room_num'].median(), 1)
            else:
                return round(reference_df['room_num'].median(), 1)
        return row['room_num']

    if 'room_num' in df.columns and 'area' in df.columns:
        reference_df = df[df['room_num'] > 0]
        df['room_num'] = df.apply(lambda row: impute_room_num(row, reference_df), axis=1)

    # --- 8. Fix total_floors if it's less than floor ---
    if 'floor' in df.columns and 'total_floors' in df.columns:
        df.loc[(df['floor'] > df['total_floors']) & df['total_floors'].notna(), 'total_floors'] = df['floor']

    # --- 9. Fill missing total_floors using floor or median ---
    if 'total_floors' in df.columns:
        median_total = df[df['total_floors'] > 0]['total_floors'].median()

        def fix_missing_total_floors(row):
            if pd.isna(row['total_floors']):
                if not pd.isna(row['floor']):
                    return max(row['floor'], median_total)
                else:
                    return median_total
            return row['total_floors']

        df['total_floors'] = df.apply(fix_missing_total_floors, axis=1)

    # --- 10. Fill missing values in each column with reasonable defaults ---
    if 'floor' in df.columns:
        df['floor'] = df['floor'].fillna(df['floor'].median())
    if 'days_to_enter' in df.columns:
        df['days_to_enter'] = df['days_to_enter'].fillna(df['days_to_enter'].median())
    if 'distance_from_center' in df.columns:
        df['distance_from_center'] = df['distance_from_center'].fillna(df['distance_from_center'].median())
    if 'monthly_arnona' in df.columns:
        df['monthly_arnona'] = df['monthly_arnona'].fillna(df['monthly_arnona'].mean())
    if 'building_tax' in df.columns:
        df['building_tax'] = df['building_tax'].fillna(df['building_tax'].mean())
    if 'garden_area' in df.columns:
        df['garden_area'] = df['garden_area'].fillna(0)

    # --- 11. Encode 'neighborhood' with one-hot encoding ---
    if 'neighborhood' in df.columns:
        df = pd.get_dummies(df, columns=['neighborhood'], drop_first=True)

    return df


## Data Cleaning Explanation

1. Removed irrelevant columns
We dropped the columns: address, description, num_of_payments, and num_of_images.

address was removed because we already used neighborhood, which provides enough location information.

description contains free text that we did not analyze, so it adds noise but no value.

num_of_payments and num_of_images are not meaningful features for predicting rent price — they describe marketing or listing behavior, not the property itself.

2. Filtered for regular apartments
We kept only rows where property_type is equal to "דירה" (apartment).

This was done to focus only on standard residential apartments, and not include commercial units, houses, or other real estate types that follow different pricing logic.

3. Dropped rows with missing critical values
We removed rows where price or neighborhood was missing.

These are essential features: price is our target variable, and neighborhood is one of the most influential predictors.

Without them, the row cannot be used for training or prediction.


4. Removed unrealistic values
Rows where area < 10 square meters were deleted.

Rows where price < 1000 or price > 50000 were also removed.

 These were considered outliers or errors that could distort the model’s learning.

5. Fixed invalid number of rooms
If room_num == 0, we replaced it using the median number of rooms for similar apartments with the same area (±5m²).

A value of 0 rooms is unrealistic, so we used nearby data to infer a reasonable value without removing the row.

6. Corrected inconsistent total floors
If floor > total_floors, we set total_floors = floor.

An apartment cannot be on the 5th floor of a 4-floor building, so we assumed a minimum total equal to its floor.

If total_floors was missing:

We used the maximum of floor or the median of total_floors to replace it.


7. Filled missing values by column
We filled missing values using different strategies depending on the column:

-For floor, we used the median value. This avoids the influence of extreme values and keeps the data realistic.

-For total_floors, if missing, we applied logic: if the apartment’s floor is known, we took the maximum between the current floor and the median of total_floors.

-For days_to_enter, we used the median, since it's a neutral value that doesn’t suggest immediate or delayed availability.

-For distance_from_center, we filled missing values with the median, to avoid bias from outliers.

-For monthly_arnona and building_tax, we used the mean because those are regular numeric costs that vary smoothly.
For garden_area, we used 0 assuming that if no value was recorded, the apartment most likely has no garden.


8. Encoded categorical variable
We used one-hot encoding on the column neighborhood with drop_first=True.

This allows the model to use neighborhoods as numeric input while avoiding multicollinearity.


In [17]:
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df = prepare_data(df)
df.head()


Unnamed: 0,property_type,room_num,floor,area,garden_area,days_to_enter,monthly_arnona,building_tax,total_floors,has_parking,...,neighborhood_קרית שלום,neighborhood_רביבים,neighborhood_רמת אביב ג,neighborhood_רמת אביב החדשה,neighborhood_שבזי,neighborhood_שיכון בבלי,neighborhood_שפירא,neighborhood_תל ברוך צפון,neighborhood_תל חיים,neighborhood_תל כביר
0,דירה,3.0,2.0,71,0.0,0.0,467.0,614.0,4.0,1,...,0,0,0,0,0,0,0,0,0,0
1,דירה,3.0,1.0,70,0.0,0.0,240.0,190.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
2,דירה,2.5,1.0,65,0.0,0.0,400.0,150.0,4.0,1,...,0,0,0,0,0,0,0,0,0,0
3,דירה,2.0,3.0,40,0.0,0.0,100.0,100.0,3.0,0,...,0,0,0,0,0,0,0,0,0,0
4,דירה,3.0,1.0,70,0.0,0.0,250.0,50.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df.isna().sum().sort_values(ascending=False)


property_type                               0
neighborhood_נוה שאנן                       0
neighborhood_יד אליהו                       0
neighborhood_יפו ד                          0
neighborhood_כפיר                           0
                                           ..
neighborhood_הצפון החדש סביבת ככר המדינה    0
neighborhood_הצפון הישן החלק הדרום מזרחי    0
neighborhood_הצפון הישן החלק הדרום מערבי    0
neighborhood_הצפון הישן החלק המרכזי         0
neighborhood_תל כביר                        0
Length: 67, dtype: int64

# elastic net

In [19]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
import pandas as pd


df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
# Clean and preprocess the data using the custom PrepareData function
df = prepare_data(df)

# Split features and target
X = df.drop(columns=['price', 'property_type'])  # Drop the target and unused category
y = df['price']

# Scale the features using standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Initialize and train the ElasticNet model
model = ElasticNet(alpha=1.0, l1_ratio=0.3, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse_elastic = mean_squared_error(y_test, y_pred, squared=False)

# Print performance metrics
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"ElasticNet RMSE: {rmse_elastic:.2f}")
print(f"R² Score: {r2:.2f}")



Mean Absolute Error (MAE): 1594.69
ElasticNet RMSE: 2357.22
R² Score: 0.59


# 10 fold cross validation

In [10]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df = prepare_data(df)



# Separate features and target
X = df.drop(columns=['price', 'property_type'])
y = df['price']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the model
model = ElasticNet(alpha=1.0, l1_ratio=0.3, random_state=42)

# Define 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validated predictions
y_pred = cross_val_predict(model, X_scaled, y, cv=kf)

# Evaluate performance across all folds
mae = mean_absolute_error(y, y_pred)
r2 = r2_score(y, y_pred)
rmse = mean_squared_error(y, y_pred, squared=False)

# Print metrics
print(f"10-Fold Cross-Validated MAE: {mae:.2f}")
print(f"10-Fold Cross-Validated RMSE: {rmse:.2f}")
print(f"10-Fold Cross-Validated R² Score: {r2:.2f}")


10-Fold Cross-Validated MAE: 1822.67
10-Fold Cross-Validated RMSE: 3237.06
10-Fold Cross-Validated R² Score: 0.46


# top 5 features

In [11]:
# Identifier les 5 features avec l'influence la plus forte sur le prix (positif ou négatif)

import numpy as np
import pandas as pd
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler

# Recharger les données nettoyées
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df = prepare_data(df)

# Séparer X et y
X = df.drop(columns=['price', 'property_type'])
y = df['price']

# Standardisation
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Entraîner ElasticNet
model = ElasticNet(alpha=1.0, l1_ratio=0.3, random_state=42)
model.fit(X_scaled, y)

# Extraire les coefficients
coef = pd.Series(model.coef_, index=X.columns)

# Obtenir les 5 features les plus influentes (en valeur absolue)
top_5_features = coef.reindex(coef.abs().sort_values(ascending=False).head(5).index)

top_5_features


area                 780.763688
room_num             602.682624
is_furnished         431.831220
building_tax         415.854050
neighborhood_שבזי    372.280167
dtype: float64

# Trees

#  DecisionTreeRegressor

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load and prepare the data
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df = prepare_data(df)
# Split features and target
X = df.drop(columns=['price', 'property_type'])  # Drop target and unused category
y = df['price']

# No need to scale features for tree-based models
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the decision tree regressor
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse_rf = mean_squared_error(y_test, y_pred, squared=False)

# Print evaluation metrics
print(f"Decision Tree MAE: {mae:.2f}")
print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Decision Tree R² Score: {r2:.2f}")


Decision Tree MAE: 1882.15
Random Forest RMSE: 2711.52
Decision Tree R² Score: 0.46


# Random Forest Regressor

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load and prepare the data
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df = prepare_data(df)
# Split features and target
X = df.drop(columns=['price', 'property_type'])  # Drop target and unused category
y = df['price']

# No need to scale features for tree-based models
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse_rf = mean_squared_error(y_test, y_pred, squared=False)

# Print evaluation metrics
print(f"Random Forest MAE: {mae:.2f}")
print(f"Random Forest RMSE: {rmse_rf:.2f}")
print(f"Random Forest R² Score: {r2:.2f}")


Random Forest MAE: 1301.06
Random Forest RMSE: 1845.47
Random Forest R² Score: 0.75


# RandomizedSearchCV 

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
import pandas as pd

# Load and prepare the data
df = pd.read_excel('C:/Users/sam_b/Desktop/train.xlsx')
df = prepare_data(df)

# Separate features and target
X = df.drop(columns=['price', 'property_type'])
y = df['price']

# Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter space
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Set up RandomizedSearchCV
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42,
    n_jobs=-1
)

# Fit search
random_search.fit(X_train, y_train)

# Best model and evaluation
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse_rf = mean_squared_error(y_test, y_pred, squared=False)

print("Best hyperparameters found:", random_search.best_params_)
print(f"MAE after Randomized Search: {mae:.2f}")
print(f"RMSE after Randomized Search : {rmse_rf:.2f}")
print(f"R² Score after Randomized Search: {r2:.2f}")


Best hyperparameters found: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
MAE after Randomized Search: 1303.42
RMSE after Randomized Search : 1829.03
R² Score after Randomized Search: 0.75


In [32]:
best_model = random_search.best_estimator_
best_model

RandomForestRegressor(max_depth=30, n_estimators=300, random_state=42)

We used RandomizedSearchCV to optimize the hyperparameters of the Random Forest model.
It is faster than Grid Search and sufficient for this case, as it samples a wide range of combinations without checking them all.
This helps improve performance while saving computational time.

# 10-fold cross-validation

In [33]:

kf = KFold(n_splits=10, shuffle=True, random_state=42)
y_pred_cv = cross_val_predict(best_model, X, y, cv=kf)
mae_cv = mean_absolute_error(y, y_pred_cv)
r2_cv = r2_score(y, y_pred_cv)
rmse_cv = mean_squared_error(y, y_pred_cv, squared=False)

print(f"10-Fold CV MAE: {mae_cv:.2f}")
print(f"10-Fold CV RMSE: {rmse_cv:.2f}")
print(f"10-Fold CV R²: {r2_cv:.2f}")

10-Fold CV MAE: 1575.98
10-Fold CV RMSE: 2752.36
10-Fold CV R²: 0.61


# top 5 features

In [34]:
# Extract and display the top 5 most important features from the trained Random Forest model

import pandas as pd

# Get feature importances from the trained model
feature_importances = pd.Series(best_model.feature_importances_, index=X.columns)

# Sort and get the top 5
top_5_rf_features = feature_importances.sort_values(ascending=False).head(5)

# Display
print(top_5_rf_features)


area                    0.592579
building_tax            0.056902
distance_from_center    0.043701
monthly_arnona          0.038843
total_floors            0.031799
dtype: float64


# Model Comparison

In [35]:
# Create a comparison table between ElasticNet and RandomForest, now including RMSE as well
model_comparison = pd.DataFrame({
    'Model': ['ElasticNet', 'Random Forest'],
    'MAE (Test Set)': [1594.69, 1301.06],
    'RMSE (Test Set)': [2357.22, 1845.47],
    'R² (Test Set)': [0.59, 0.75],
    'MAE (10-Fold CV)': [1822.67, 1575.98],
    'RMSE (10-Fold CV)': [3237.06, 2752.36],
    'R² (10-Fold CV)': [0.46, 0.61]
})


model_comparison

Unnamed: 0,Model,MAE (Test Set),RMSE (Test Set),R² (Test Set),MAE (10-Fold CV),RMSE (10-Fold CV),R² (10-Fold CV)
0,ElasticNet,1594.69,2357.22,0.59,1822.67,3237.06,0.46
1,Random Forest,1301.06,1845.47,0.75,1575.98,2752.36,0.61


We compared two regression models: ElasticNet and Random Forest.
Based on both test set performance and 10-fold cross-validation, the Random Forest model consistently outperformed ElasticNet.
It achieved a lower Mean Absolute Error (MAE), a lower Root Mean Squared Error (RMSE), and a higher R² score, indicating better predictive accuracy and a stronger ability to explain the variance in apartment prices.
ElasticNet, being a linear model, is simpler but less capable of capturing complex, nonlinear relationships in the data.
In contrast, Random Forest, an ensemble of decision trees, adapts better to diverse patterns and interactions between features.

# Comparison of important features between the models

There is some overlap between the most important features in both models.
For example, both ElasticNet and Random Forest showed that area and building tax are very important for predicting apartment price.
But each model also focused on different things:
ElasticNet found features like number of rooms and furnished status important,
while Random Forest gave more importance to features like distance from the center and monthly arnona.
This is normal, because Random Forest can find more complex patterns that ElasticNet cannot.

# Explanation of the differences between the models

The main difference between ElasticNet and Random Forest is how they learn from the data.
ElasticNet is a linear model, which means it can only find straight-line relationships between features and the price.
Random Forest is a non-linear model, so it can detect more complex patterns and interactions between features.
This is why Random Forest had better performance in MAE, RMSE, and R².
In simple cases, ElasticNet may work well, but in more complicated datasets like this one, Random Forest is usually more accurate.

In [36]:
import pickle

# Save ElasticNet model
with open("elasticnet_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save Random Forest model
with open("random_forest_model.pkl", "wb") as f:
    pickle.dump(best_model, f)


In [37]:
import pickle
import pandas as pd

# === Load new data to predict on ===
df_test = pd.read_excel("X_test.xlsx")  # Replace with actual test file otherwise the code will give an error
df_test = prepare_data(df_test)         # Use your existing cleaning function

# === Prepare features for prediction ===
X_test = df_test.drop(columns=["price"], errors="ignore")  # If 'price' is not in test, no problem

# === Load and predict with ElasticNet model ===
with open("elasticnet_model.pkl", "rb") as f:
    elasticnet_loaded = pickle.load(f)

y_pred_elasticnet = elasticnet_loaded.predict(X_test)
print("ElasticNet Predictions:")
print(y_pred_elasticnet)

# === Load and predict with Random Forest model ===
with open("random_forest_model.pkl", "rb") as f:
    rf_loaded = pickle.load(f)

y_pred_rf = rf_loaded.predict(X_test)
print("Random Forest Predictions:")
print(y_pred_rf)


FileNotFoundError: [Errno 2] No such file or directory: 'X_test.xlsx'