In [11]:

# Function to clean and convert
def clean_and_convert(price_str):
    # Remove any non-numeric characters (including Arabic commas)
    cleaned_price = ''.join(c for c in price_str if c.isdigit() or c == '.')

    # Convert to float
    try:
        return float(cleaned_price)
    except ValueError:
        # Handle cases where the conversion fails
        return None  # or another appropriate value


In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer  # Import the imputer

# Load your dataset
land = pd.read_csv('land.csv')

land.head()
# Apply the cleaning function to 'the price' column
land['the price'] = land['the price'].apply(clean_and_convert)
land['Space'] = land['Space'].apply(clean_and_convert)  # Apply the appropriate cleaning function for 'space'

features = land.drop('the price', axis=1)
features['Space'] = land['Space']
target = land['the price']

# Convert categorical columns to numerical using Label Encoding
label_encoder = LabelEncoder()
for column in features.select_dtypes(include=['object']).columns:
    features[column] = label_encoder.fit_transform(features[column])

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
features_imputed = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_imputed, target, test_size=0.1, random_state=42)

# Create a Random Forest Regressor
regressor = RandomForestRegressor(random_state=42)

# Train the regressor on the training data
regressor.fit(X_train, y_train)

# Make predictions on the test set
predictions = regressor.predict(X_test)

# Evaluate model performance for regression
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')




Mean Squared Error: 4392506126844.7803
R-squared: 0.5863619906552313


In [13]:
import pandas as pd
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Function to clean and convert 'the price' column to numeric
def clean_and_convert_price(value):
    if isinstance(value, str):
        try:
            # Remove Arabic comma and convert to float
            cleaned_value = float(value.replace('٬', '').replace(',', ''))
            return cleaned_value
        except ValueError:
            return pd.NaT  # Return NaT (Not a Time) for values that couldn't be converted
    else:
        return value  # Return the original value for non-string elements

# Load your dataset (replace 'data.csv' with the actual filename or path to your dataset)
land = pd.read_csv('land.csv')

# Apply the cleaning function to 'the price' column
land['the price'] = land['the price'].apply(clean_and_convert_price)

# Extract features and target variable
features = land.drop('the price', axis=1)
target = land['the price']

# Convert categorical columns to numerical using Label Encoding
label_encoder = LabelEncoder()
for column in features.select_dtypes(include=['object']).columns:
    features[column] = label_encoder.fit_transform(features[column])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)




In [18]:
# Assuming you have the required functions and the model loaded

# Sample data
new_data = pd.DataFrame({
    'Region': ['Riyadh region'],
    'City': ['Riyadh'],
    'City / neighborhood': ['Riyadh/Al -Nazim'],
    'The reference number of the deal': ['20051562.0'],  # Convert to string
    'The date of the deal AD': ['2023/05/30'],
    'Hijri date of the deal': ['11/11/1444'],
    'Real estate classification': ['residential'],
    'Real estate': ['piece of ground'],
    'The number of real estate': ['1.0'],  # Convert to string
    'Space': ['81,640,539']
})



# Update the year to 2024
new_data['The date of the deal AD'] = new_data['The date of the deal AD'].str.replace('2023', '2024')
new_data['Hijri date of the deal'] = new_data['Hijri date of the deal'].str.replace('1444', '1445')

# Apply the cleaning function to 'the price' column
new_data['the price'] = new_data['Space'].apply(clean_and_convert_price)

# Convert 'Space' to numeric
new_data['Space'] = new_data['Space'].str.replace(',', '').astype(float)



# Combine the training and new data for consistent label encoding
combined_data = pd.concat([features, new_data])

# Convert all columns to strings before label encoding
combined_data = combined_data.astype(str)

# Convert categorical columns to numerical using Label Encoding
label_encoder = LabelEncoder()

# Fit and transform during training
for column in combined_data.columns:
    combined_data[column] = label_encoder.fit_transform(combined_data[column])

# Split the combined data back into training and new data
features_encoded = combined_data.iloc[:len(features)]
new_data_encoded = combined_data.iloc[len(features):]

# Assume 'new_data_encoded' has the same features as your training data (excluding 'the price')
new_data_features = new_data_encoded.drop('the price', axis=1)

# Make predictions
predicted_price = regressor.predict(new_data_features)

# Print the predicted price
print(f'Predicted Price: {predicted_price[0]}')


Predicted Price: 9419298.34
