## Load Libraries and data

In [34]:
import numpy as np
import pandas as pd
import pickle
import re
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeRegressor
import warnings
warnings.filterwarnings("ignore")

In [35]:
# Read the source CSV into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv(filepath_or_buffer="data2.csv")

In [36]:
# Feature columns used by the model; the target ('Price_per_month') stays in `data`.
feature_columns = [
    'neighbourhood', 'Area', 'Number_of_rooms',
    'Build_year', 'Floor', 'Nearest_educational_institution',
    'Nearest_shop', 'Public_transport_stop', 'Heating_system',
    'energy_class', 'Building_type', 'No_of_floors',
]

# Take an explicit copy so that later column assignments on `model_data`
# operate on an independent frame rather than on a selection of `data`.
# Without it pandas raises SettingWithCopyWarning, which the blanket
# warnings filter in the import cell silently hides.
model_data = data[feature_columns].copy()

## Clean data

In [37]:
# Both conversions are skipped when the column is already numeric, so this
# cell can be re-run without failing on values that are no longer strings.

# Price: strip the spaces embedded in the number, then parse to float.
# Missing prices stay NaN here instead of raising.
# NOTE(review): rows with a NaN target will still break model fitting - confirm
# whether such rows exist and should be dropped.
if not pd.api.types.is_numeric_dtype(data['Price_per_month']):
    data['Price_per_month'] = pd.to_numeric(
        data['Price_per_month'].astype(str).str.replace(" ", "", regex=False)
    ).astype(float)

# Build year: keep only the part before the first comma, drop every
# non-digit character and parse the remainder as a number.
# Entries with no digits (or missing) become NaN; the numeric pipeline's
# median imputer fills them later.
if not pd.api.types.is_numeric_dtype(model_data['Build_year']):
    build_year_digits = (
        model_data['Build_year']
        .astype(str)
        .str.split(",").str[0]
        .str.replace("[^0-9]", "", regex=True)
    )
    # assign() builds a new frame, so nothing is written into a slice of `data`.
    model_data = model_data.assign(
        Build_year=pd.to_numeric(build_year_digits, errors="coerce")
    )

## Create Pipeline and model

In [38]:
# Split the feature columns by dtype using the public select_dtypes API
# ('number' plus 'bool' for numeric columns, object dtype for categoricals).
num_attribs = model_data.select_dtypes(include=['number', 'bool']).columns.to_list()
cat_attribs = model_data.select_dtypes(include='O').columns.to_list()

# Numeric columns: fill missing values with the column median.
num_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy="median"))])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" makes categories that were
# not present during fit encode as all zeros instead of raising, so
# predictions on the test split or on new inputs do not fail.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs),
])

In [39]:
# Chain preprocessing and the regressor into a single estimator. The step
# names are the lowercase class names that make_pipeline would generate.
model = Pipeline(steps=[
    ("columntransformer", preprocess),
    ("linearregression", LinearRegression()),
])

In [40]:
# Features come from the cleaned feature frame, the target from the price column.
X, y = model_data, data['Price_per_month']

## Split the data and fit the preprocessing + model pipeline

In [41]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [42]:
# Fit the preprocessing steps and the regressor on the training split only;
# as the cell's last expression, the fitted pipeline is displayed below.
model.fit(X=X_train, y=y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['Area', 'Number_of_rooms',
                                                   'Build_year', 'Floor',
                                                   'Nearest_educational_institution',
                                                   'Nearest_shop',
                                                   'Public_transport_stop',
                                                   'No_of_floors']),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                             

## Save model

In [45]:
# Serialize the fitted pipeline (preprocessing + regressor) to model.pkl
# in the current working directory, overwriting any existing file.
with open("model.pkl", mode="wb") as model_file:
    pickle.dump(obj=model, file=model_file)

## Test model: predict the price for a single example

In [44]:
# One hand-written example with the same columns the model was trained on.
sample_listing = {
    "neighbourhood": "Šnipiškės",
    "Area": 71.78,
    "Number_of_rooms": 2,
    "Build_year": 1940,
    "Floor": 1,
    "Nearest_educational_institution": 310.0,
    "Nearest_shop": 170.0,
    "Public_transport_stop": 80.0,
    "Heating_system": "Electric",
    "energy_class": "1 Class Good (1-3)",
    "Building_type": "Brick",
    "No_of_floors": 3,
}

# Wrap the record in a one-row frame and display the predicted value.
sample_frame = pd.DataFrame([sample_listing])
model.predict(sample_frame)

array([896.61232372])