# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy.stats import norm
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import joblib
from src.model.log_scaler import LogScaler
from src.model.interval_voting_regressor import IntervalVotingRegressor
from config import config

# Load data and copy dataframe

In [None]:
# Widen pandas' display limits (up to 100 columns, 1000 characters per line)
# so wide frames are not truncated when printed.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Load the raw CSV from the configured path and work on a deep copy,
# so initial_df is not modified by the cleaning steps applied to df.
initial_df = pd.read_csv(config.get_path('raw_data_csv_path'))
df = initial_df.copy(deep=True)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

# Remove unimportant column

In [None]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

# Check and remove duplicates

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

# Check null values

In [None]:
df.isnull().sum()

In [None]:
df['null_counts'] = df.isnull().sum(axis=1)
df.head()

In [None]:
# For every threshold from 1 up to the largest per-row null count,
# report how many rows have strictly more nulls than that threshold.
max_null_count = df['null_counts'].max()
for threshold in range(1, max_null_count + 1):
    rows_above = (df['null_counts'] > threshold).sum()
    print("null_counts > %s = %s" % (threshold, rows_above))

# Delete 47 rows in which most columns are null

In [None]:
df.drop(df[df['null_counts'] > 4].index, axis=0, inplace=True)
df.shape

In [None]:
df.drop(columns=['null_counts'], inplace=True)
df.shape

# Remove currency and convert prices to float

In [None]:
df['Ціна'].str.contains('грн.').sum()

In [None]:
df['Ціна'].str.contains('span').sum()

In [None]:
# Keep only the text before the currency marker, strip all whitespace
# (thousands separators) and store the price as a 32-bit float.
price_text = df['Ціна'].str.split('грн.', expand=True)[0]
df['Ціна'] = price_text.str.replace(r'\s+', '', regex=True).astype('float32')

In [None]:
df.head()

In [None]:
df['Ціна'].describe()

# Check and remove outliers in the price column

In [None]:
sorted_df_by_price = df.sort_values(by=['Ціна'])
low_prices = sorted_df_by_price['Ціна'].head(10)
high_prices = sorted_df_by_price['Ціна'].tail(20)
print('outer range (low) of the distribution:')
print(low_prices)
print('\nouter range (high) of the distribution:')
print(high_prices)

In [None]:
# Standardise the prices and print the 10 lowest / 20 highest z-scores
# to see how far the extremes sit from the mean.
price_scaled = StandardScaler().fit_transform(np.array(df['Ціна']).reshape(-1, 1))
scaled_ascending = price_scaled[price_scaled[:, 0].argsort()]
low_range, high_range = scaled_ascending[:10], scaled_ascending[-20:]
print('outer range (low) of the distribution:')
print(low_range)
print('\nouter range (high) of the distribution:')
print(high_range)

In [None]:
df = df[df['Ціна'] < 100000.0]
df.shape

# Check price distribution

In [None]:
plt.figure(figsize=(15, 3))
sns.histplot(df['Ціна'])
plt.xticks(np.arange(0, df['Ціна'].max(), 10000))
plt.show()

# Convert City/District column to 2 separate columns with city and district

In [None]:
df['Локація (Місто, Район)'].str.contains(' - ').sum()

In [None]:
df['Локація (Місто, Район)'].str.contains(', ').sum()

In [None]:
def convert_district(row):
    """Return the district label suffixed with its city, e.g. 'District (City)'.

    Parameters:
        row: mapping-like object (a DataFrame row) with the keys 'Місто' (city)
            and 'Район' (district).

    Returns:
        'district (city)' when both values are present. The district is returned
        unchanged when it is missing/empty or when the city is missing/empty.

    Missing values are detected with ``pd.isna`` so that NaN (which is truthy and
    cannot be concatenated with a string) is handled the same way as None.
    """
    city = row['Місто']
    district = row['Район']
    # Nothing to decorate: pass missing or empty districts through untouched.
    if pd.isna(district) or not district:
        return district
    # Without a usable city there is no suffix to add.
    if pd.isna(city) or not city:
        return district
    return '{district} ({city})'.format(district=district, city=city)

In [None]:
# Drop everything after ' - ', then split "City, District" into two columns.
# n=1 limits the split to the first ', ' so a location that contains further
# commas still yields exactly two columns instead of failing the assignment.
location = df['Локація (Місто, Район)'].str.split(' - ', expand=True)[0]
df[['Місто', 'Район']] = location.str.split(', ', n=1, expand=True)
# Remove the HTML comment residue left in the text (literal match, not a regex).
for location_column in ('Місто', 'Район'):
    df[location_column] = df[location_column].str.replace('<!-- -->', '', regex=False)
# Make district names unique across cities by appending the city name.
df['Район'] = df.apply(convert_district, axis=1)
df.sample(10)

# Replace rarely seen cities with 'other'

In [None]:
cities_value_counts = df['Місто'].value_counts()

In [None]:
cities_value_counts.count()

In [None]:
cities_value_counts[cities_value_counts < 3].count()

In [None]:
# Cities that occur fewer than 3 times are collapsed into the single label 'Інше'.
cities_value_counts_less_than_3 = cities_value_counts[cities_value_counts < 3]
is_rare_city = df['Місто'].isin(cities_value_counts_less_than_3.index)
df['Місто'] = df['Місто'].mask(is_rare_city, 'Інше')
df['Місто'].value_counts()

# Check average rent prices by city

In [None]:
grouped = df.groupby(['Місто'])['Ціна'].aggregate('mean').reset_index().sort_values('Ціна', ascending=False)
plt.figure(figsize=(15, 5))
sns.barplot(x='Місто', y='Ціна', data=df, order=grouped['Місто'])
plt.xticks(rotation=90)
plt.show()

# Define function for ANOVA test

In [None]:
def anova_test(df, column):
    """Run a one-way ANOVA (type 2) of 'Ціна' against a categorical column.

    Parameters:
        df: DataFrame containing the 'Ціна' column and ``column``.
        column: name of the column to treat as a categorical factor.

    Returns:
        The ANOVA table produced by ``sm.stats.anova_lm(model, typ=2)``.

    Spaces in ``column`` are replaced with underscores to obtain a name usable in
    the formula. The renaming is done on a renamed copy, so the caller's frame is
    never modified, not even when fitting raises.
    """
    formula_name = column.replace(" ", "_")
    if formula_name != column:
        data = df.rename(columns={column: formula_name})
    else:
        data = df

    model = ols('Ціна ~ C(' + formula_name + ')', data=data).fit()
    return sm.stats.anova_lm(model, typ=2)

# ANOVA test for city column

In [None]:
anova_test(df, 'Місто')

# Fill null values in district column with 'unknown' value and also replace rarely seen values with 'unknown'

In [None]:
df[['Місто', 'Район']].isnull().sum()

In [None]:
df['Район'] = df['Район'].fillna('Невідомо')

In [None]:
disctrict_value_counts = df['Район'].value_counts()

In [None]:
# Districts that occur fewer than 6 times are folded into the 'Невідомо' label.
disctrict_value_counts_less_than_6 = disctrict_value_counts[disctrict_value_counts < 6]
is_rare_district = df['Район'].isin(disctrict_value_counts_less_than_6.index)
df['Район'] = df['Район'].mask(is_rare_district, 'Невідомо')
df['Район'].value_counts()

# Check average rent prices by district

In [None]:
grouped = df.groupby(['Район'])['Ціна'].aggregate('mean').reset_index().sort_values('Ціна', ascending=False)
plt.figure(figsize=(15, 5))
sns.barplot(x='Район', y='Ціна', data=df, order=grouped['Район'])
plt.xticks(rotation=90)
plt.show()

# ANOVA test for district column

In [None]:
anova_test(df, 'Район')

# Remove original City/District column

In [None]:
df.drop("Локація (Місто, Район)", axis=1, inplace=True)

# Convert 'floor' and 'floors number' columns to integer

In [None]:
df['Поверх'] = df['Поверх'].astype('int32')
df['Поверховість'] = df['Поверховість'].astype('int32')

# Check distribution for 'floor' column

In [None]:
sns.histplot(df['Поверх'])

# Check how 'price' depends on 'floor'

In [None]:
sns.barplot(x=df['Поверх'], y=df['Ціна'])
plt.xticks(rotation=90)
plt.show()

# ANOVA test for floor column

In [None]:
anova_test(df, 'Поверх')

# Check distribution for 'floors number' column

In [None]:
sns.histplot(df['Поверховість'])

# Check how 'price' depends on 'floors number'

In [None]:
sns.barplot(x=df['Поверховість'], y=df['Ціна'])
plt.xticks(rotation=90)
plt.show()

# ANOVA test for floors number column

In [None]:
anova_test(df, 'Поверховість')

# Remove units and convert to float 'Total area' and 'Kitchen area' columns

In [None]:
df['Загальна площа'].str.contains(' м²').sum()

In [None]:
# Strip the ' м²' unit suffix and store the total area as a 32-bit float.
total_area_text = df['Загальна площа'].str.replace(' м²', '')
df['Загальна площа'] = total_area_text.astype('float32')

In [None]:
df['Загальна площа'].describe()

# Check distribution for 'Total area' column

In [None]:
sns.kdeplot(df['Загальна площа'])

# Check how 'price' depends on 'Total area'

In [None]:
sns.scatterplot(x=df['Загальна площа'], y=df['Ціна'])

In [None]:
df['Площа кухні'].str.contains(' м²').sum()

In [None]:
# Strip the ' м²' unit suffix and store the kitchen area as a 32-bit float.
kitchen_area_text = df['Площа кухні'].str.replace(' м²', '')
df['Площа кухні'] = kitchen_area_text.astype('float32')

In [None]:
df['Площа кухні'].describe()

# Check distribution for 'Kitchen area' column

In [None]:
sns.kdeplot(df['Площа кухні'])

# Check how 'price' depends on 'Kitchen area'

In [None]:
sns.scatterplot(x=df['Площа кухні'], y=df['Ціна'])

# 'Number of rooms' column seems fine, no changes required

In [None]:
df['Кількість кімнат'].value_counts()

# Check distribution for 'Number of rooms' column

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(df['Кількість кімнат'])

# Check how 'price' depends on 'Number of rooms'

In [None]:
plt.figure(figsize=(10, 5))
sns.barplot(x=df['Кількість кімнат'], y=df['Ціна'])

# ANOVA test for Number of rooms column

In [None]:
anova_test(df, 'Кількість кімнат')

In [None]:
df.head()

# Check 'Furniture' column and fill null values with 'unknown' value 

In [None]:
df['Меблювання'].value_counts()

In [None]:
df['Меблювання'].isnull().sum()

In [None]:
df['Меблювання'] = df['Меблювання'].fillna('Не вказано')

# Check distribution for 'Furniture' column

In [None]:
plt.figure(figsize=(10, 5))
sns.histplot(df['Меблювання'])

# Check how 'price' depends on 'Furniture'

In [None]:
plt.figure(figsize=(10, 5))
sns.barplot(x=df['Меблювання'], y=df['Ціна'])

In [None]:
df = df[df['Меблювання'] != 'Без меблів']

# ANOVA test for furniture column

In [None]:
anova_test(df, 'Меблювання')

In [None]:
#df.drop(columns=['Меблювання'], inplace=True)

# Check 'Repair' column, fill null values with 'unknown'

In [None]:
df['Ремонт'].value_counts()

In [None]:
df['Ремонт'].isnull().sum()

In [None]:
df['Ремонт'] = df['Ремонт'].fillna('Не вказано')

# Check distribution for 'Repair' column

In [None]:
plt.figure(figsize=(12, 5))
sns.histplot(df['Ремонт'])

# Check how 'price' depends on 'Repair'

In [None]:
plt.figure(figsize=(12, 5))
sns.barplot(x=df['Ремонт'], y=df['Ціна'])

# ANOVA test for Repair column

In [None]:
anova_test(df, 'Ремонт')

# Convert 'Pets' column to binary format

In [None]:
df['Домашні улюбленці'].value_counts()

In [None]:
df['Домашні улюбленці'].isnull().sum()

In [None]:
# 1 when the text contains 'Так', otherwise 0 (missing values count as 0).
pets_allowed = df['Домашні улюбленці'].fillna('').str.contains('Так', regex=False)
df['Домашні улюбленці'] = pets_allowed.astype('int32')

# Check distribution for 'Pets' column

In [None]:
df['Домашні улюбленці'].value_counts().plot(kind='bar')

# Check how 'price' depends on 'Pets'

In [None]:
sns.barplot(x=df['Домашні улюбленці'], y=df['Ціна'])

# ANOVA test for Pets column (the result indicates that there is no significant difference between groups)

In [None]:
anova_test(df, 'Домашні улюбленці')

# Select unique utilities, create binary columns for each and remove original column

In [None]:
df['Автономність при блекауті'].value_counts()

In [None]:
df['Автономність при блекауті'].isnull().sum()

In [None]:
# Collect every distinct utility mentioned in the comma-separated column.
unique_utilities = set()
for value in df['Автономність при блекауті'].unique():
    if isinstance(value, str):  # skip missing values
        unique_utilities.update(value.split(', '))
# Iterate in sorted order: a set of strings has no stable iteration order between
# interpreter runs, which would make the order of the new columns vary per run.
for utility in sorted(unique_utilities):
    df[utility] = df['Автономність при блекауті'].apply(
        lambda x: 1 if isinstance(x, str) and (utility in x) else 0
    )
df.head()

In [None]:
df.drop(columns=['Автономність при блекауті'], inplace=True)
df.head()

# Check distributions of all utilities columns and their influence on the 'price' column

In [None]:
df['Працює ліфт'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Працює ліфт'], y=df['Ціна'])

In [None]:
anova_test(df, 'Працює ліфт')

In [None]:
df['Працює водопопостачання'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Працює водопопостачання'], y=df['Ціна'])

In [None]:
anova_test(df, 'Працює водопопостачання')

In [None]:
df['Працює опалення'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Працює опалення'], y=df['Ціна'])

In [None]:
anova_test(df, 'Працює опалення')

In [None]:
df['Підключене резервне живлення'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Підключене резервне живлення'], y=df['Ціна'])

In [None]:
anova_test(df, 'Підключене резервне живлення')

In [None]:
df['Працює інтернет'].value_counts().plot(kind='bar')

In [None]:
sns.barplot(x=df['Працює інтернет'], y=df['Ціна'])

In [None]:
anova_test(df, 'Працює інтернет')

# Check correlation for 'price' with all numerical features

In [None]:
df.corr(numeric_only=True)['Ціна']

In [None]:
plt.figure(figsize=(12, 10))
sns.heatmap(df.corr(numeric_only=True), annot=True)
plt.show()

# Remove columns with weak correlation with 'price'

In [None]:
df.drop('Домашні улюбленці', axis=1, inplace=True)
df.drop(columns=['Працює інтернет', 'Працює опалення', 'Працює водопопостачання'], inplace=True)

# Take another look at the relationships between numerical columns from dataset

In [None]:
sns.set()
cols = ['Ціна', 'Поверх', 'Поверховість', 'Загальна площа', 'Площа кухні']
sns.pairplot(df[cols], height=2.5)
plt.show()

# The graph above reveals a data point where 'floor' is greater than 'floors number', which makes no sense. I think it is fine to fix this data point manually

In [None]:
df[df['Поверх'] > df['Поверховість']].head()

In [None]:
# Raise the recorded number of floors to the apartment's floor for every row where
# the floor exceeds it. Selecting by condition instead of a hard-coded row label
# cannot add a new row if that label is absent and also covers any other such rows.
floor_above_building = df['Поверх'] > df['Поверховість']
df.loc[floor_above_building, 'Поверховість'] = df.loc[floor_above_building, 'Поверх']

In [None]:
df.loc[172]

# Check for duplicates once more after all data changes

In [None]:
df.duplicated().sum()

In [None]:
df.drop_duplicates(inplace=True)
df.shape

# Apply log transformation for 'price' to get more 'normal' distribution

In [None]:
# Histogram + KDE on a density scale with a fitted normal curve on top
# (replacement for the deprecated sns.distplot(..., fit=norm)).
ax = sns.histplot(df['Ціна'], kde=True, stat='density')
mu, sigma = norm.fit(df['Ціна'])
price_grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(price_grid, norm.pdf(price_grid, mu, sigma), c='black')
# Normal probability plot in a separate figure.
fig = plt.figure()
stats.probplot(df['Ціна'], plot=plt)

In [None]:
df_clean = df.copy(deep=True)
df_clean.head()

In [None]:
df['Ціна'] = np.log(df['Ціна'])
df.head()

In [None]:
# Same diagnostic as before the transformation: histogram + KDE on a density scale
# with a fitted normal curve (replacement for the deprecated sns.distplot).
ax = sns.histplot(df['Ціна'], kde=True, stat='density')
mu, sigma = norm.fit(df['Ціна'])
price_grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(price_grid, norm.pdf(price_grid, mu, sigma), c='black')
# Normal probability plot in a separate figure.
fig = plt.figure()
res = stats.probplot(df['Ціна'], plot=plt)

# Create train, test data

In [None]:
# Features are every column except the (log-transformed) price, which is the target.
X = df.drop('Ціна', axis=1)
y = df['Ціна']
# Hold out 15% for testing. A fixed random_state keeps the split, and therefore all
# reported metrics and the saved model, reproducible between notebook runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

# Create some helpful functions and data for testing

In [None]:
data = [{
    'Поверх': 14, 'Поверховість': 16, 'Загальна площа': 38.0, 'Площа кухні': 10.0, 'Кількість кімнат': '1 кімната', 'Меблювання': 'З меблями',
    'Ремонт': 'Косметичний ремонт', 'Місто': 'Одеса', 'Район': 'Київський (Одеса)', 'Підключене резервне живлення': 0, 'Працює ліфт': 0
},
{
    'Поверх': 20, 'Поверховість': 25, 'Загальна площа': 48.0, 'Площа кухні': 20.0, 'Кількість кімнат': '2 кімнати', 'Меблювання': 'З меблями',
    'Ремонт': 'Євроремонт', 'Місто': 'Одеса', 'Район': 'Приморський (Одеса)', 'Підключене резервне живлення': 1, 'Працює ліфт': 0
},
{
    'Поверх': 2, 'Поверховість': 5, 'Загальна площа': 20.0, 'Площа кухні': 7.0, 'Кількість кімнат': '1 кімната', 'Меблювання': 'З меблями',
    'Ремонт': 'Косметичний ремонт', 'Місто': 'Запоріжжя', 'Район': 'Невідомо', 'Підключене резервне живлення': 0, 'Працює ліфт': 0
},
{
    'Поверх': 30, 'Поверховість': 30, 'Загальна площа': 250.0, 'Площа кухні': 25.0, 'Кількість кімнат': '5+ кімнат', 'Меблювання': 'З меблями',
    'Ремонт': 'Авторський проект', 'Місто': 'Київ', 'Район': 'Печерський (Київ)', 'Підключене резервне живлення': 1, 'Працює ліфт': 1
},
{
    'Поверх': 8, 'Поверховість': 9, 'Загальна площа': 38.0, 'Площа кухні': 10.0, 'Кількість кімнат': '1 кімната', 'Меблювання': 'З меблями',
    'Ремонт': 'Євроремонт', 'Місто': 'Львів', 'Район': 'Сихівський (Львів)', 'Підключене резервне живлення': 0, 'Працює ліфт': 0
},
{
    'Поверх': 2, 'Поверховість': 5, 'Загальна площа': 41.0, 'Площа кухні': 20.0, 'Кількість кімнат': '1 кімната', 'Меблювання': 'З меблями',
    'Ремонт': 'Авторський проект', 'Місто': 'Одеса', 'Район': 'Київський (Одеса)', 'Підключене резервне живлення': 0, 'Працює ліфт': 0
},
{
    'Поверх': 1, 'Поверховість': 9, 'Загальна площа': 46.0, 'Площа кухні': 9.0, 'Кількість кімнат': '2 кімнати', 'Меблювання': 'З меблями',
    'Ремонт': 'Житловий стан', 'Місто': 'Миколаїв', 'Район': 'Інгульський (Миколаїв)', 'Підключене резервне живлення': 0, 'Працює ліфт': 0
}]
manual_test_df = pd.DataFrame(data, index=[1,2,3,4,5,6,7])
manual_test_df.head(10)

In [None]:
def run_manual_test(pipes_dict, manual_test_df):
    """Predict prices for hand-written sample rows with every fitted model.

    Parameters:
        pipes_dict: mapping of model name -> fitted estimator. Objects exposing
            ``best_estimator_`` (grid searches) predict through that estimator,
            everything else predicts directly.
        manual_test_df: DataFrame of feature rows; it is not modified.

    Returns:
        The first 10 rows of a copy of ``manual_test_df`` with one
        'pred_price_<Name>' column per model. Predictions are passed through
        ``np.exp`` because the models are trained on the log of the price.
    """
    results = manual_test_df.copy()
    for name, pipe in pipes_dict.items():
        estimator = getattr(pipe, 'best_estimator_', pipe)
        # Always predict from the untouched input frame, so prediction columns
        # added for earlier models never become part of a later model's input.
        results['pred_price_' + name.title()] = np.exp(estimator.predict(manual_test_df))
    return results.head(10)

In [None]:
def print_x_test_with_pred_prices(X_test, y_test, y_pred_dict):
    """Return 30 sampled test rows with every model's predicted price and the real one.

    Predictions and targets are given on the log scale and converted back with
    ``np.exp``. One 'pred_price_<Name>' column is added per entry of
    ``y_pred_dict``, followed by a 'price' column; the sample uses random_state=1.
    """
    price_columns = {
        'pred_price_' + model_name.title(): np.exp(log_prediction)
        for model_name, log_prediction in y_pred_dict.items()
    }
    price_columns['price'] = np.exp(y_test)
    comparison = X_test.assign(**price_columns)
    return comparison.sample(30, random_state=1)

In [None]:
def scatterplots_for_pred_test(y_test, y_pred_dict):
    """Draw one actual-vs-predicted price scatter plot per model.

    Both the targets and the predictions are exponentiated before plotting.
    A red diagonal from 0 up to 100000 marks where predictions equal the
    actual values.
    """
    diagonal = range(0, 100000)
    for model_name in y_pred_dict:
        log_prediction = y_pred_dict[model_name]
        plt.figure(figsize=(8, 7))
        plt.scatter(np.exp(y_test), np.exp(log_prediction))
        plt.title(model_name.title())
        plt.xlabel('y_test')
        plt.ylabel('y_pred')
        plt.plot(diagonal, diagonal, c='r')
        plt.show()

In [None]:
def error_for_pred_test(y_test, y_pred_dict):
    """Plot, per model, the absolute price error against the actual price.

    Targets and predictions are exponentiated first, so the error is the
    absolute difference of the exponentiated values.
    """
    for model_name in y_pred_dict:
        actual_price = np.exp(y_test)
        predicted_price = np.exp(y_pred_dict[model_name])
        plt.figure(figsize=(12, 10))
        plt.scatter(actual_price, np.abs(actual_price - predicted_price))
        plt.title(model_name.title())
        plt.xlabel('y_test')
        plt.ylabel('abs(y_test - y_pred)')
        plt.show()

In [None]:
def print_metrics(y_test, y_pred_dict, pipes_dict):
    """Print R2, MAE and MSE for every model's test predictions.

    The metrics are computed on the values exactly as passed in. For every model
    except 'linear', 'Voting' and 'Stacking' the entry in ``pipes_dict`` is
    expected to expose ``best_params_`` and ``best_score_``, which are printed too.
    """
    models_without_search = ('linear', 'Voting', 'Stacking')
    for name, y_pred in y_pred_dict.items():
        # ANSI escape codes render the header in bold.
        print(f'\033[1mResults for {name.title()}:\033[0m')
        if name not in models_without_search:
            search = pipes_dict[name]
            print("Best parameters:", search.best_params_)
            print("Best cross-validation score:", search.best_score_)
        for label, metric in (
            ('R2 score', r2_score),
            ('MAE', mean_absolute_error),
            ('MSE', mean_squared_error),
        ):
            print(label, metric(y_test, y_pred))
        print('--------------------------------------')

In [None]:
pipes_dict = {}
y_pred_dict = {}

In [None]:
df.head()

# Create column transformer for One hot encoding categorical columns and scaling numerical columns

In [None]:
# Columns are selected by position in the frame passed to fit:
# positions 4-8 are one-hot encoded (categories unseen during fit are ignored),
# positions 0-3 go through the project's LogScaler, and all remaining columns
# are passed through unchanged.
# NOTE(review): presumably 0-3 are the numeric floor/area columns and 4-8 the
# categorical ones — confirm against X.columns; positional indices break silently
# if the column order of X changes.
col_tnf = ColumnTransformer(transformers=[
    ('category_tnf', OneHotEncoder(handle_unknown='ignore'), [4,5,6,7,8]),
    #('stand_scal_tnf', StandardScaler(), [0,1,2,3])
    ('log_scaler_tnf', LogScaler(), [0,1,2,3])
], remainder='passthrough')

# Create linear regression model

In [None]:
# Baseline: plain linear regression behind the shared column transformer.
model = LinearRegression()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)]).fit(X_train, y_train)
y_pred = pipe.predict(X_test)

pipes_dict['linear'], y_pred_dict['linear'] = pipe, y_pred

# Create ridge regression model

In [None]:
# Ridge regression: pick alpha with a 5-fold cross-validated grid search.
model = Ridge()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__alpha': [0.5, 1]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['ridge'], y_pred_dict['ridge'] = grid_pipeline, y_pred

# Create lasso regression model

In [None]:
# Lasso regression: pick alpha with a 5-fold cross-validated grid search.
model = Lasso()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__alpha': [0.0001, 0.001, 0.01]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['lasso'], y_pred_dict['lasso'] = grid_pipeline, y_pred

# Create k neighbors regression model

In [None]:
# k-nearest-neighbours regression: choose the neighbour count by 5-fold grid search.
model = KNeighborsRegressor()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__n_neighbors': [3, 5, 7]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['kneighbors'], y_pred_dict['kneighbors'] = grid_pipeline, y_pred

# Create Decision tree model

In [None]:
# Decision tree: choose the maximum depth by 5-fold grid search.
model = DecisionTreeRegressor()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__max_depth': [5, 7]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['DecisionTree'], y_pred_dict['DecisionTree'] = grid_pipeline, y_pred

# Create SVR model

In [None]:
# Support vector regression with an RBF kernel: search C and epsilon with 5-fold CV.
model = SVR(kernel='rbf')
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{
    'model__C': [1000, 10000],
    'model__epsilon': [0.1, 1.0],
}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['SVR'], y_pred_dict['SVR'] = grid_pipeline, y_pred

# Create Random Forest model

In [None]:
# Random forest: choose the number of trees by 5-fold grid search.
model = RandomForestRegressor()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__n_estimators': [200, 500]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['RandomForest'], y_pred_dict['RandomForest'] = grid_pipeline, y_pred

# Create Extra trees regression model

In [None]:
# Extra-trees ensemble: choose the number of trees by 5-fold grid search.
model = ExtraTreesRegressor()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__n_estimators': [300, 500]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['ExtraTrees'], y_pred_dict['ExtraTrees'] = grid_pipeline, y_pred

# Create AdaBoost regression model

In [None]:
# AdaBoost: search the number of estimators and the learning rate with 5-fold CV.
model = AdaBoostRegressor()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{
    'model__n_estimators': [30, 50, 70],
    'model__learning_rate': [0.1, 0.5],
}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['AdaBoost'], y_pred_dict['AdaBoost'] = grid_pipeline, y_pred

# Create Gradient boosting regression model

In [None]:
# Gradient boosting: choose the number of boosting stages by 5-fold grid search.
model = GradientBoostingRegressor()
pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)])
param_grid = [{'model__n_estimators': [100, 200, 300]}]

grid_pipeline = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(X_train, y_train)
y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['GradBoost'], y_pred_dict['GradBoost'] = grid_pipeline, y_pred

# Create XGBoost regression model

In [None]:
# XGBoost regressor. This cell previously instantiated GradientBoostingRegressor,
# so the results stored under 'XGBoost' were not produced by XGBoost at all.
model = XGBRegressor()

pipe = Pipeline([
    ('col_tnf', col_tnf),
    ('model', model)
])

# Search tree count, tree depth and learning rate with 5-fold cross-validation.
param_grid = [{
    'model__n_estimators': [300, 500],
    'model__max_depth': [2, 5],
    'model__learning_rate': [0.01, 0.1],
}]

grid_pipeline = GridSearchCV(pipe, param_grid, cv=5)

grid_pipeline.fit(X_train, y_train)

y_pred = grid_pipeline.best_estimator_.predict(X_test)

pipes_dict['XGBoost'] = grid_pipeline
y_pred_dict['XGBoost'] = y_pred

# Create Voting regression model

In [None]:
# Voting ensemble: combines the predictions of five differently configured regressors.
estimators = [
    ('lr', LinearRegression()),
    ('ls', Lasso(alpha=0.001)),
    ('et', ExtraTreesRegressor(n_estimators=500)),
    ('gb', GradientBoostingRegressor(max_depth=2, n_estimators=500)),
    ('xgb', XGBRegressor(max_depth=5, learning_rate=0.1, n_estimators=200)),
]
model = VotingRegressor(estimators=estimators)

pipe_voting = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)]).fit(X_train, y_train)
y_pred = pipe_voting.predict(X_test)

pipes_dict['Voting'], y_pred_dict['Voting'] = pipe_voting, y_pred

# Create Stacking regression model

In [None]:
# Stacking ensemble: a Ridge meta-model is trained on the base regressors' predictions.
estimators = [
    ('lr', LinearRegression()),
    ('ls', Lasso(alpha=0.001)),
    ('et', ExtraTreesRegressor(n_estimators=500)),
    ('gb', GradientBoostingRegressor(n_estimators=300)),
    ('xgb', XGBRegressor(n_estimators=500, max_depth=2, learning_rate=0.1)),
]
model = StackingRegressor(estimators=estimators, final_estimator=Ridge(alpha=1))

pipe_stacking = Pipeline(steps=[('col_tnf', col_tnf), ('model', model)]).fit(X_train, y_train)
y_pred = pipe_stacking.predict(X_test)

pipes_dict['Stacking'], y_pred_dict['Stacking'] = pipe_stacking, y_pred

# Print results

In [None]:
print_metrics(y_test, y_pred_dict, pipes_dict)

In [None]:
run_manual_test(pipes_dict, manual_test_df)

In [None]:
print_x_test_with_pred_prices(X_test, y_test, y_pred_dict)

In [None]:
scatterplots_for_pred_test(y_test, y_pred_dict)

In [None]:
error_for_pred_test(y_test, y_pred_dict)

# Helpful functions for interval models

In [None]:
def print_x_test_with_pred_price_interval(X_test, y_test, lower_bounds, point_predictions, upper_bounds):
    """Return 30 sampled test rows with the actual price and the predicted interval.

    All price inputs are on the log scale and are converted back with ``np.exp``.
    Added columns, in order: 'price', 'pred_price_low', 'pred_price',
    'pred_price_up'. The sample uses random_state=1.
    """
    with_prices = X_test.assign(
        price=np.exp(y_test),
        pred_price_low=np.exp(lower_bounds),
        pred_price=np.exp(point_predictions),
        pred_price_up=np.exp(upper_bounds),
    )
    return with_prices.sample(30, random_state=1)

In [None]:
def run_manual_test_interval(pipe, manual_test_df):
    """Predict a price interval for hand-written sample rows.

    ``pipe.predict`` must return a 2-D array whose columns 0, 1 and 2 hold the
    lower bound, the point prediction and the upper bound on the log scale.
    Returns the first 10 rows of a copy of ``manual_test_df`` extended with the
    exponentiated values; the input frame is left untouched.
    """
    result = manual_test_df.copy()
    predicted_prices = np.exp(pipe.predict(result))
    output_columns = ('lower_pred_price', 'pred_price', 'upper_pred_price')
    for position, column_name in enumerate(output_columns):
        result[column_name] = predicted_prices[:, position]
    return result.head(10)

In [None]:
def show_difference_in_predicted_intervals_and_actual_price(y_test, lower_bounds, point_predictions, upper_bounds):
    """Plot actual prices against the predicted interval, ordered by actual price.

    Test observations are sorted by their actual value; the bounds and point
    predictions are re-ordered to match. Every curve is exponentiated before
    plotting.
    """
    y_test_sorted = y_test.sort_values()

    def in_sorted_order(values):
        # Attach the test index to the predictions, then follow the sorted order.
        return pd.Series(values, index=y_test.index).reindex(y_test_sorted.index)

    curves = [
        (y_test_sorted, 'blue', 'Actual price'),
        (in_sorted_order(lower_bounds), 'yellow', 'Low limit for pred price'),
        (in_sorted_order(point_predictions), 'orange', 'Pred price'),
        (in_sorted_order(upper_bounds), 'red', 'Upper limit for pred price'),
    ]
    positions = range(0, y_test_sorted.shape[0])

    plt.figure(figsize=(20, 10))
    plt.title('Prices difference')
    plt.xlabel('sorted test obserbations')
    plt.ylabel('prices')
    for log_values, colour, label in curves:
        plt.plot(positions, np.exp(log_values), c=colour, label=label)
    plt.legend()
    plt.show()

In [None]:
def average_diff(lower_bounds, upper_bounds):
    """Return the mean width of the predicted intervals after exponentiating both bounds."""
    upper_prices = np.exp(upper_bounds)
    lower_prices = np.exp(lower_bounds)
    interval_widths = upper_prices - lower_prices
    return interval_widths.mean()

# Scoring function for interval model (share of actual values that fall inside the predicted interval; higher is better)

In [None]:
from sklearn.metrics import make_scorer

def interval_accuracy(y_true, y_pred):
    """Return the fraction of targets lying strictly inside the predicted interval.

    Parameters:
        y_true: 1-D targets (pandas Series, NumPy array or list).
        y_pred: 2-D array-like; column 0 is the lower bound and column 2 the
            upper bound (column 1 is not used here).

    Returns:
        Number of rows with ``lower < y_true < upper`` divided by the number of rows.

    Values are compared position by position, so the index of a Series is
    irrelevant and plain arrays are accepted as well.
    """
    actual = np.asarray(y_true).ravel()
    bounds = np.asarray(y_pred)
    inside = (bounds[:, 0] < actual) & (actual < bounds[:, 2])
    return inside.sum() / actual.shape[0]

interval_accuracy_scorer = make_scorer(interval_accuracy, greater_is_better=True)

# Create Voting regressor adapted to predict interval

In [None]:
"""
estimators = [
    #('lr', LinearRegression()),
    ('ls', Lasso(alpha=0.001)),
    ('et', ExtraTreesRegressor(n_estimators=500)),
    ('gb', GradientBoostingRegressor(max_depth=5)),
    ('xgb', XGBRegressor(max_depth=2))
]

interval_voter = IntervalVotingRegressor(estimators=estimators, interval_width=0.9)

pipe = Pipeline([
    ('col_tnf', col_tnf),
    ('model', interval_voter)
])

param_grid = [{
    'model__gb__n_estimators': [100, 200],
    'model__gb__learning_rate': [0.1, 0.5],
    'model__xgb__n_estimators': [100, 200],
    'model__xgb__learning_rate': [0.5, 0.1],
}]

grid_pipeline = GridSearchCV(pipe, param_grid, scoring=interval_accuracy_scorer, cv=5, verbose=3)

grid_pipeline.fit(X_train, y_train)

prediction = grid_pipeline.best_estimator_.predict(X_test)
lower_bound, y_pred, upper_bound = prediction[:, 0], prediction[:, 1], prediction[:, 2]
print("Best parameters:", grid_pipeline.best_params_)
print("Best cross-validation score:", grid_pipeline.best_score_)
"""

# Interval voting ensemble with equal weights and interval_width=0.8.
estimators = [
    ('lr', LinearRegression()),
    ('ls', Lasso(alpha=0.005)),
    ('et', ExtraTreesRegressor(n_estimators=500)),
    ('gb', GradientBoostingRegressor(n_estimators=75)),
    ('xgb', XGBRegressor(n_estimators=75)),
]
interval_voter = IntervalVotingRegressor(estimators=estimators, weights=[1,1,1,1,1], interval_width=0.8)

pipe = Pipeline(steps=[('col_tnf', col_tnf), ('model', interval_voter)]).fit(X_train, y_train)

# Prediction columns: 0 = lower bound, 1 = point prediction, 2 = upper bound.
prediction = pipe.predict(X_test)
lower_bound = prediction[:, 0]
y_pred = prediction[:, 1]
upper_bound = prediction[:, 2]

# Results for Interval Voting Regressor

In [None]:
print('Interval accuracy: ', interval_accuracy(y_test, prediction))

In [None]:
print('R2 score', r2_score(y_test, y_pred))
print('MAE', mean_absolute_error(y_test, y_pred))
print('MSE', mean_squared_error(y_test, y_pred))

In [None]:
average_diff(lower_bound, upper_bound)

In [None]:
average_diff(lower_bound, upper_bound)/np.exp(y_test).mean()

In [None]:
average_diff(lower_bound, upper_bound)/np.exp(y_test).median()

In [None]:
average_diff(lower_bound, upper_bound)/np.exp(y_test).std()

In [None]:
run_manual_test_interval(pipe, manual_test_df)

In [None]:
print_x_test_with_pred_price_interval(X_test, y_test, lower_bound, y_pred, upper_bound)

In [None]:
show_difference_in_predicted_intervals_and_actual_price(y_test, lower_bound, y_pred, upper_bound)

# Save the VotingRegressor model. We are not going to use the IntervalVotingRegressor model because its accuracy is not good enough

In [None]:
# Persist the fitted 'Voting' pipeline (column transformer + voting regressor)
# with joblib at the configured model path.
with open(config.get_path('model_path'), 'wb') as file:
    joblib.dump(pipes_dict['Voting'], file)

# Save clean dataframe

In [None]:
# Store the cleaned frame twice: as a joblib dump and as a CSV without the index.
# df_clean was copied before the log transformation of the price, so the saved
# prices are on their original scale.
with open(config.get_path('clean_data_path'), 'wb') as file:
    joblib.dump(df_clean, file)

df_clean.to_csv(config.get_path('clean_data_csv_path'), sep=',', index=False)