In [1]:
# Tashkent house prices
import numpy as np
import pandas as pd
import sklearn
URL = 'https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv'
df = pd.read_csv(URL)
df.head()

Unnamed: 0,location,district,rooms,size,level,max_levels,price
0,"город Ташкент, Юнусабадский район, Юнусабад 8-...",Юнусабадский,3,57,4,4,52000
1,"город Ташкент, Яккасарайский район, 1-й тупик ...",Яккасарайский,2,52,4,5,56000
2,"город Ташкент, Чиланзарский район, Чиланзар 2-...",Чиланзарский,2,42,4,4,37000
3,"город Ташкент, Чиланзарский район, Чиланзар 9-...",Чиланзарский,3,65,1,4,49500
4,"город Ташкент, Чиланзарский район, площадь Актепа",Чиланзарский,3,70,3,5,55000


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   location    7565 non-null   object
 1   district    7565 non-null   object
 2   rooms       7565 non-null   int64 
 3   size        7565 non-null   object
 4   level       7565 non-null   int64 
 5   max_levels  7565 non-null   int64 
 6   price       7565 non-null   object
dtypes: int64(3), object(4)
memory usage: 413.8+ KB


In [3]:
# Data Cleaning
df = df.drop((df[df['price'] == 'Договорная']).index, axis = 0)
df = df.drop('location', axis = 1)
df.loc[df[df['size'] == 'Площадьземли:1сот'].index, 'size'] = 100
df['price'] = df['price'].astype(int)
df['size'] = df['size'].astype(float)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7466 entries, 0 to 7564
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   district    7466 non-null   object 
 1   rooms       7466 non-null   int64  
 2   size        7466 non-null   float64
 3   level       7466 non-null   int64  
 4   max_levels  7466 non-null   int64  
 5   price       7466 non-null   int64  
dtypes: float64(1), int64(4), object(1)
memory usage: 666.3+ KB


In [4]:
# train test split
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df, test_size = 0.2, random_state = 42)

X_train = train_set.drop('price', axis = 1)
Y = train_set['price'].copy()

X_num = X_train.drop('district', axis = 1)

In [5]:
# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy = 'median')),
    ('std_scaler', StandardScaler())
])

In [6]:
from sklearn.compose import ColumnTransformer
num_attribs = list(X_num)
cat_attribs = ['district']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

In [7]:
X_prepaired = full_pipeline.fit_transform(X_train)

In [8]:
X_prepaired

array([[-1.49142306, -0.05731241, -1.20598257, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.34996179, -0.02540552, -0.76132443, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57073064, -0.04880391, -1.20598257, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 2.19134664,  0.10151299, -1.20598257, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57073064, -0.0452587 ,  0.12799185, ...,  0.        ,
         0.        ,  0.        ],
       [-0.57073064, -0.03604115,  0.12799185, ...,  0.        ,
         0.        ,  0.        ]])

In [9]:
# Machine learning (Linear Regression)
from sklearn.linear_model import LinearRegression
LR_model = LinearRegression()
LR_model.fit(X_prepaired, Y)

In [11]:
# Testing the model
test_data = X_train.sample(10)
test_label = Y.loc[test_data.index]
test_data_prepaired = full_pipeline.transform(test_data)

In [13]:
predicted_data = LR_model.predict(test_data_prepaired)
predicted_data

array([ 99854.71500387,  43004.6832289 ,  43003.66712853,  16873.17191308,
        69328.47344632,  86725.63020583,  70359.14554876,  43928.886712  ,
       110017.48193586, 113602.85124524])

In [14]:
pd.DataFrame({'Actual': test_label, 'Predicted': predicted_data})

Unnamed: 0,Actual,Predicted
3830,59500,99854.715004
7179,37000,43004.683229
2566,30000,43003.667129
3329,23000,16873.171913
2019,20800,69328.473446
1365,57000,86725.630206
1910,64700,70359.145549
5657,29500,43928.886712
2468,63000,110017.481936
5950,87000,113602.851245


In [17]:
# Evaluating the model
X_test = test_set.drop('price', axis = 1)
Y_test = test_set['price'].copy()
X_test_prepaired = full_pipeline.transform(X_test)

In [18]:
y_predicted = LR_model.predict(X_test_prepaired)
y_predicted

array([72408.53404716, 70405.34939977, 73649.67103971, ...,
       90716.34987584, 94562.6528502 , 49952.94579234])

In [19]:
# Checking mean absolute error
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(Y_test,y_predicted)
mae

60350.558414148836

In [21]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
RF_model = RandomForestRegressor()
RF_model.fit(X_prepaired, Y)

In [22]:
y_predicted = RF_model.predict(X_test_prepaired)

In [23]:
mae = mean_absolute_error(Y_test,y_predicted)
mae

55821.65963720629

In [24]:
# Cross Validation
X = df.drop("price", axis = 1)
y = df['price'].copy()

X_prepaired = full_pipeline.transform(X)

In [25]:
from sklearn.model_selection import cross_val_score

mse_scores = cross_val_score(RF_model, X_prepaired, y, scoring = "neg_mean_squared_error", cv = 5)

In [26]:
def display_scores(scores):
  print("Scores: ", scores)
  print("Mean: ", scores.mean())
  print("Std.dev: ", scores.std())
display_scores(np.sqrt(-mse_scores))

Scores:  [ 146220.72504576  198495.93686959  163388.58298278 1348339.92794643
 1351404.11019691]
Mean:  641569.8566082949
Std.dev:  578572.5870183491


In [27]:
#Presenting Model
import pickle

filename = 'RF_model.sav'
pickle.dump(RF_model, open(filename, 'wb'))

In [29]:
with open(filename, 'rb') as file:
  model = pickle.load(file)

In [30]:
import joblib
joblib.dump(LR_model, 'LR_model.jlb')

['LR_model.jlb']

In [31]:
joblib.dump(RF_model, 'RF_model.jlb')

['RF_model.jlb']