In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Loading my data
df = pd.read_csv('apartment_data.csv')

In [3]:
df

Unnamed: 0,district,area_m2,age_building,building_floor,apartment_floor,number_windows,garage,price
0,Хан-Уул,104,6,9,6,4.0,Байхгүй,2900000.0
1,Баянзүрх,60,10,12,12,2.0,Байхгүй,2800000.0
2,Сүхбаатар,106,0,4,4,4.0,Байгаа,4200000.0
3,Сүхбаатар,74,18,5,5,4.0,Байхгүй,185000000.0
4,Баянгол,80,35,9,8,4.0,Байхгүй,185000000.0
...,...,...,...,...,...,...,...,...
535,Баянзүрх,61,6,16,14,2.0,Байгаа,160000000.0
536,Баянзүрх,54,0,16,8,2.0,Байгаа,1800000.0
537,Сонгинохайрхан,47,9,9,6,3.0,Байхгүй,120000000.0
538,Сүхбаатар,50,28,5,2,3.0,Байхгүй,130000000.0


In [4]:
df.isnull().sum()

district           0
area_m2            0
age_building       0
building_floor     0
apartment_floor    0
number_windows     0
garage             0
price              0
dtype: int64

In [5]:
df['district'].unique()

array(['Хан-Уул', 'Баянзүрх', 'Сүхбаатар', 'Баянгол', 'Сонгинохайрхан',
       'Чингэлтэй', 'Налайх'], dtype=object)

In [6]:
categorical_columns = ['district', 'garage']

# Convert categorical variables to dummy/indicator variables
df = pd.get_dummies(df, columns=categorical_columns)

In [7]:
# Separate features and target variable
X = df.drop('price', axis=1)
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
X

Unnamed: 0,area_m2,age_building,building_floor,apartment_floor,number_windows,district_Баянгол,district_Баянзүрх,district_Налайх,district_Сонгинохайрхан,district_Сүхбаатар,district_Хан-Уул,district_Чингэлтэй,garage_Байгаа,garage_Байхгүй
0,104,6,9,6,4.0,False,False,False,False,False,True,False,False,True
1,60,10,12,12,2.0,False,True,False,False,False,False,False,False,True
2,106,0,4,4,4.0,False,False,False,False,True,False,False,True,False
3,74,18,5,5,4.0,False,False,False,False,True,False,False,False,True
4,80,35,9,8,4.0,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
535,61,6,16,14,2.0,False,True,False,False,False,False,False,True,False
536,54,0,16,8,2.0,False,True,False,False,False,False,False,True,False
537,47,9,9,6,3.0,False,False,False,True,False,False,False,False,True
538,50,28,5,2,3.0,False,False,False,False,True,False,False,False,True


#### Made a nice loop to figure out what models work the best for the Regression 

In [9]:
# Models
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Random Forest': RandomForestRegressor(n_estimators=40),
    'Gradient Boosting': GradientBoostingRegressor(),
    'Support Vector Regressor': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Print results
    print(f"Model: {name}")
    print(f"Mean Absolute Error: {mae:.2f}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")
    print("------------------------")

Model: Linear Regression
Mean Absolute Error: 77871943.45
Mean Squared Error: 9008925464186604.00
R-squared: -0.06
------------------------
Model: Lasso Regression
Mean Absolute Error: 77871942.27
Mean Squared Error: 9008924970966056.00
R-squared: -0.06
------------------------
Model: Random Forest
Mean Absolute Error: 71942134.59
Mean Squared Error: 9058167263484680.00
R-squared: -0.06
------------------------
Model: Gradient Boosting
Mean Absolute Error: 80871556.63
Mean Squared Error: 11330154350072142.00
R-squared: -0.33
------------------------
Model: Support Vector Regressor
Mean Absolute Error: 79653701.42
Mean Squared Error: 9117216240024172.00
R-squared: -0.07
------------------------
Model: K-Nearest Neighbors
Mean Absolute Error: 83452037.04
Mean Squared Error: 12192270159259260.00
R-squared: -0.43
------------------------


  model = cd_fast.enet_coordinate_descent(


#### The error was just too much to say the least, but I guess that the model is at no fault, it is just the data that is lacking. I understand now that the gathering of data, as well as the cleaning and formatting of the data is most important for a ML project. 

In [10]:
df['price'].mean()

102749444.44444445

#### Saving the model in order to use it for the streamlit app that will hopefully give a price tag on a randomly inputted apartment type from the user

In [11]:
model = RandomForestRegressor(n_estimators=60)  # Replace with your actual model
model.fit(X_train, y_train)

# Save the trained model to a file
joblib.dump(model, 'random_forest_model.joblib')

['random_forest_model.joblib']