In [48]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/anvarnarz/praktikum_datasets/main/housing_data_08-02-2021.csv")
df.head()

Unnamed: 0,location,district,rooms,size,level,max_levels,price
0,"город Ташкент, Юнусабадский район, Юнусабад 8-...",Юнусабадский,3,57,4,4,52000
1,"город Ташкент, Яккасарайский район, 1-й тупик ...",Яккасарайский,2,52,4,5,56000
2,"город Ташкент, Чиланзарский район, Чиланзар 2-...",Чиланзарский,2,42,4,4,37000
3,"город Ташкент, Чиланзарский район, Чиланзар 9-...",Чиланзарский,3,65,1,4,49500
4,"город Ташкент, Чиланзарский район, площадь Актепа",Чиланзарский,3,70,3,5,55000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7565 entries, 0 to 7564
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   location    7565 non-null   object
 1   district    7565 non-null   object
 2   rooms       7565 non-null   int64 
 3   size        7565 non-null   object
 4   level       7565 non-null   int64 
 5   max_levels  7565 non-null   int64 
 6   price       7565 non-null   object
dtypes: int64(3), object(4)
memory usage: 413.8+ KB


In [4]:
df['size'] = pd.to_numeric(df['size'], downcast='float', errors='coerce')
df['price'] = pd.to_numeric(df['price'], downcast='float', errors='coerce')

In [5]:
df.describe()

Unnamed: 0,rooms,size,level,max_levels,price
count,7565.0,7564.0,7565.0,7565.0,7466.0
mean,2.625644,113.26899,3.699273,6.038202,71334.21
std,1.085201,1491.312744,2.237275,2.613271,640523.0
min,1.0,1.0,1.0,1.0,2.0
25%,2.0,50.0,2.0,4.0,35000.0
50%,3.0,66.0,3.0,5.0,46500.0
75%,3.0,86.0,5.0,9.0,67000.0
max,10.0,70000.0,19.0,25.0,52000000.0


In [6]:
df = df[df['size']>=50]
df = df[df['size']<=250]
df.describe()

Unnamed: 0,rooms,size,level,max_levels,price
count,5857.0,5857.0,5857.0,5857.0,5776.0
mean,2.929486,81.009895,3.911559,6.351716,80495.99
std,0.912298,29.93924,2.323929,2.66304,726952.4
min,1.0,50.0,1.0,1.0,2.0
25%,2.0,60.0,2.0,4.0,40200.0
50%,3.0,72.0,4.0,5.0,52995.5
75%,3.0,93.0,5.0,9.0,74917.5
max,8.0,250.0,19.0,25.0,52000000.0


In [7]:
df.shape

(5857, 7)

In [8]:
df.isnull().sum()

Unnamed: 0,0
location,0
district,0
rooms,0
size,0
level,0
max_levels,0
price,81


In [9]:
new = df['location'].str.split(",", n=1, expand=True)
df['c'] = new[0]
df['r'] = new[1]
df.drop(['location', 'c'], axis=1, inplace=True)
df.head()

Unnamed: 0,district,rooms,size,level,max_levels,price,r
0,Юнусабадский,3,57.0,4,4,52000.0,"Юнусабадский район, Юнусабад 8-й квартал"
1,Яккасарайский,2,52.0,4,5,56000.0,"Яккасарайский район, 1-й тупик Шота Руставели"
3,Чиланзарский,3,65.0,1,4,49500.0,"Чиланзарский район, Чиланзар 9-й квартал"
4,Чиланзарский,3,70.0,3,5,55000.0,"Чиланзарский район, площадь Актепа"
8,Учтепинский,2,51.0,3,4,26200.0,"Учтепинский район, Чиланзар-21"


In [10]:
new = df['r'].str.split(",", n=1, expand=True)
df['loc'] = new[1]
df['t'] = new[0]
df.drop(['r', 't'], axis=1, inplace=True)
df.head()

Unnamed: 0,district,rooms,size,level,max_levels,price,loc
0,Юнусабадский,3,57.0,4,4,52000.0,Юнусабад 8-й квартал
1,Яккасарайский,2,52.0,4,5,56000.0,1-й тупик Шота Руставели
3,Чиланзарский,3,65.0,1,4,49500.0,Чиланзар 9-й квартал
4,Чиланзарский,3,70.0,3,5,55000.0,площадь Актепа
8,Учтепинский,2,51.0,3,4,26200.0,Чиланзар-21


In [11]:
df = df.iloc[:,[0,6,4,3,1,2,5]]
df.head()

Unnamed: 0,district,loc,max_levels,level,rooms,size,price
0,Юнусабадский,Юнусабад 8-й квартал,4,4,3,57.0,52000.0
1,Яккасарайский,1-й тупик Шота Руставели,5,4,2,52.0,56000.0
3,Чиланзарский,Чиланзар 9-й квартал,4,1,3,65.0,49500.0
4,Чиланзарский,площадь Актепа,5,3,3,70.0,55000.0
8,Учтепинский,Чиланзар-21,4,3,2,51.0,26200.0


In [12]:
train_set, test_set = train_test_split(df, test_size=0.1, random_state=42)
X_train = train_set.drop('price', axis=1)
y = train_set['price'].copy()
X_num = X_train.drop(['district', 'loc'], axis=1)

In [13]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ("std_scaler", StandardScaler())
])

In [14]:
num_attr = list(X_num)
num_attr

['max_levels', 'level', 'rooms', 'size']

In [15]:
category_attr = ['district', 'loc']
category_attr

['district', 'loc']

In [17]:
full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attr),
    ("category", OrdinalEncoder(), category_attr)
])

In [18]:
X_prepared = full_pipeline.fit_transform(X_train)
X_prepared[0:5,:]

array([[ 1.00435914e+00, -8.24365445e-01, -1.01904298e+00,
        -5.34047148e-01,  1.00000000e+00,  2.42000000e+02],
       [-8.82035178e-01, -8.24365445e-01, -1.01904298e+00,
        -1.03966764e+00,  9.00000000e+00,  1.01900000e+03],
       [ 1.00435914e+00,  4.62477108e-02, -1.01904298e+00,
        -8.03711408e-01,  8.00000000e+00,  1.06400000e+03],
       [ 1.00435914e+00,  2.22278060e+00, -1.01904298e+00,
         2.71574881e-01,  4.00000000e+00,  4.75000000e+02],
       [-5.04756316e-01,  4.81554289e-01,  8.11978125e-02,
        -6.35171246e-01,  5.00000000e+00,  9.20000000e+02]])

In [19]:
LR_model = LinearRegression()
RF_model = RandomForestRegressor()
Tree_model = DecisionTreeRegressor()

In [20]:
y = y.fillna(y.mean())
y.isnull().sum()

0

In [21]:
LR_model.fit(X_prepared, y)
RF_model.fit(X_prepared, y)
Tree_model.fit(X_prepared, y)

In [22]:
test_data = X_train.sample(5)
test_data

Unnamed: 0,district,loc,max_levels,level,rooms,size
6014,Мирабадский,7-й проезд Мироншаха,9,6,4,103.0
3005,Чиланзарский,Чиланзар 1-й квартал,4,4,2,64.0
7545,Чиланзарский,Чиланзар-16,4,1,3,70.0
5388,Чиланзарский,Гулистан,5,3,5,110.0
1329,Учтепинский,Чиланзар 15-й квартал,9,6,3,68.0


In [23]:
test_label = y.loc[test_data.index]
test_label

Unnamed: 0,price
6014,70000.0
3005,42200.0
7545,49800.0
5388,78000.0
1329,41500.0


In [24]:
test_data_prepared = full_pipeline.transform(test_data)
test_data_prepared

array([[ 1.00435914e+00,  9.16860866e-01,  1.18143861e+00,
         7.46858088e-01,  1.00000000e+00,  1.60000000e+02],
       [-8.82035178e-01,  4.62477108e-02, -1.01904298e+00,
        -5.67755181e-01,  6.00000000e+00,  9.03000000e+02],
       [-8.82035178e-01, -1.25967202e+00,  8.11978125e-02,
        -3.65506985e-01,  6.00000000e+00,  9.58000000e+02],
       [-5.04756316e-01, -3.89058867e-01,  2.28167940e+00,
         9.82814316e-01,  6.00000000e+00,  3.37000000e+02],
       [ 1.00435914e+00,  9.16860866e-01,  8.11978125e-02,
        -4.32923051e-01,  5.00000000e+00,  9.10000000e+02]])

In [25]:
predict_data = LR_model.predict(test_data_prepared)
predict_data

array([132219.69694746,  39796.17131283,  71851.39642596, 160936.45073421,
        48130.08341195])

In [26]:
pd.DataFrame({'Real Price':test_label, 'Predicted Price':predict_data})

Unnamed: 0,Real Price,Predicted Price
6014,70000.0,132219.696947
3005,42200.0,39796.171313
7545,49800.0,71851.396426
5388,78000.0,160936.450734
1329,41500.0,48130.083412


In [31]:
predict_data2 = RF_model.predict(test_data_prepared)
predict_data2

array([80234.89      , 43546.48571429, 50552.        , 77389.55539062,
       43016.66666667])

In [34]:
pd.DataFrame({'Real Price':test_label, 'Predicted Price':predict_data2})

Unnamed: 0,Real Price,Predicted Price
6014,70000.0,80234.89
3005,42200.0,43546.485714
7545,49800.0,50552.0
5388,78000.0,77389.555391
1329,41500.0,43016.666667


In [36]:
predict_data3 = Tree_model.predict(test_data_prepared)
predict_data3

array([70000., 43800., 49800., 78000., 41500.])

In [38]:
pd.DataFrame({'Real Price':test_label, 'Predicted Price':predict_data3})

Unnamed: 0,Real Price,Predicted Price
6014,70000.0,70000.0
3005,42200.0,43800.0
7545,49800.0,49800.0
5388,78000.0,78000.0
1329,41500.0,41500.0


In [40]:
y_test = test_set['price'].copy()
y_test

Unnamed: 0,price
6720,47000.0
4685,30299.0
1935,40000.0
38,44000.0
5180,110000.0
...,...
4437,46500.0
2080,66000.0
3439,66500.0
7123,61001.0


In [41]:
y_test.isnull().sum()

10

In [44]:
y_test = y_test.fillna(np.mean(y_test))

In [45]:
y_test.isnull().sum()

0

In [46]:
lin_mse = mean_absolute_error(y_test, predict_data)
lin_rmse = mean_squared_error(y_test, predict_data)
rmse = np.sqrt(lin_rmse)
print(lin_mse)
print(rmse)

ValueError: Found input variables with inconsistent numbers of samples: [586, 5]

In [50]:
score = cross_val_score(LR_model, X_prepared, y, scoring='neg_mean_squared_error', cv=10)
LR_rmse_scores = np.sqrt(-score)

In [51]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Std.dev:", scores.std())

In [52]:
display_scores(LR_rmse_scores)

Scores: [  45471.03483795 2276339.86274385  240881.04799728   48720.65039315
   42108.747277    289232.77505397   42895.60539375   45624.37247805
  675318.72766768   45602.38902746]
Mean: 375219.52128701407
Std.dev: 662100.9438257014


In [53]:
score1 = cross_val_score(RF_model, X_prepared, y, scoring='neg_mean_squared_error', cv=10)
RF_rmse_scores = np.sqrt(-score)

In [54]:
display_scores(RF_rmse_scores)

Scores: [  45471.03483795 2276339.86274385  240881.04799728   48720.65039315
   42108.747277    289232.77505397   42895.60539375   45624.37247805
  675318.72766768   45602.38902746]
Mean: 375219.52128701407
Std.dev: 662100.9438257014


In [55]:
import pickle

filename = "LR_model.pkl"
with open(filename, 'wb') as file:
  pickle.dump(LR_model, file)

In [56]:
with open(filename, 'rb') as file:
  model = pickle.load(file)