In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load your dataset
df = pd.read_csv('apartment_data.csv')

In [3]:
df

Unnamed: 0,price,district,area_m2,age_building,building_floor,apartment_floor,number_windows,garage
0,2800000.0,Баянзүрх,60,10,12,12,2.0,Байхгүй
1,4200000.0,Сүхбаатар,106,0,4,4,4.0,Байгаа
2,95000000.0,Сонгинохайрхан,23,23,11,11,4.0,Байхгүй
3,3400000.0,Сүхбаатар,51,1,12,9,2.0,Байхгүй
4,130000000.0,Сонгинохайрхан,88,41,2,1,2.0,Байхгүй
...,...,...,...,...,...,...,...,...
533,160000000.0,Баянзүрх,61,6,16,14,2.0,Байгаа
534,1800000.0,Баянзүрх,54,0,16,8,2.0,Байгаа
535,120000000.0,Сонгинохайрхан,47,9,9,6,3.0,Байхгүй
536,130000000.0,Сүхбаатар,50,28,5,2,3.0,Байхгүй


In [4]:
df.isnull().sum()

price              0
district           0
area_m2            0
age_building       0
building_floor     0
apartment_floor    0
number_windows     0
garage             0
dtype: int64

In [5]:
df['district'].unique()

array(['Баянзүрх', 'Сүхбаатар', 'Сонгинохайрхан', 'Хан-Уул', 'Баянгол',
       'Чингэлтэй', 'Налайх'], dtype=object)

In [6]:
categorical_columns = ['district', 'garage']

# Convert categorical variables to dummy/indicator variables
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

In [7]:
# Separate features and target variable
X = df.drop('price', axis=1)
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X

Unnamed: 0,area_m2,age_building,building_floor,apartment_floor,number_windows,district_Баянзүрх,district_Налайх,district_Сонгинохайрхан,district_Сүхбаатар,district_Хан-Уул,district_Чингэлтэй,garage_Байхгүй
0,60,10,12,12,2.0,True,False,False,False,False,False,True
1,106,0,4,4,4.0,False,False,False,True,False,False,False
2,23,23,11,11,4.0,False,False,True,False,False,False,True
3,51,1,12,9,2.0,False,False,False,True,False,False,True
4,88,41,2,1,2.0,False,False,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
533,61,6,16,14,2.0,True,False,False,False,False,False,False
534,54,0,16,8,2.0,True,False,False,False,False,False,False
535,47,9,9,6,3.0,False,False,True,False,False,False,True
536,50,28,5,2,3.0,False,False,False,True,False,False,True


In [8]:
# Models
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(n_estimators=60),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regressor': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print results
    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")
    print("------------------------")

Model: Linear Regression
Mean Absolute Error: 69313248.26
Mean Squared Error: 7919586165689593.00
R-squared: 0.11
------------------------
Model: Lasso Regression
Mean Absolute Error: 69313247.92
Mean Squared Error: 7919585804011601.00
R-squared: 0.11
------------------------
Model: Random Forest
Mean Absolute Error: 67240705.28
Mean Squared Error: 8367265973058091.00
R-squared: 0.06
------------------------
Model: Gradient Boosting
Mean Absolute Error: 71851255.33
Mean Squared Error: 8987450895461792.00
R-squared: -0.01
------------------------
Model: Support Vector Regressor
Mean Absolute Error: 74768517.16
Mean Squared Error: 8909578779658369.00
R-squared: -0.00
------------------------
Model: K-Nearest Neighbors
Mean Absolute Error: 77805740.74
Mean Squared Error: 10137123737037038.00
R-squared: -0.14
------------------------


In [9]:
df['price'].mean()

102944052.04460967

In [10]:
model = RandomForestRegressor(n_estimators=60)  # Replace with your actual model
model.fit(X_train, y_train)

# Save the trained model to a file
joblib.dump(model, 'random_forest_model.joblib')

['random_forest_model.joblib']