In [1]:
# Load IPython's autoreload extension, which provides the %autoreload magic used in the next cell.
%load_ext autoreload

In [3]:
%autoreload
 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from Visualizer import Visualizer
from Preprocessor import Preprocessor
from Model import Model
from TestModel import TestModel
from flaml import AutoML

# Notebook-wide presentation settings: seaborn grid theme for plots and no
# column truncation when pandas renders a DataFrame.
sns.set_style("darkgrid")
pd.options.display.max_columns = None

ModuleNotFoundError: No module named 'flaml'

In [None]:
# Create the project's Preprocessor and keep a handle on its `apartments` attribute.
# NOTE(review): presumably the constructor loads the raw tables — confirm in Preprocessor.
preprocessor = Preprocessor()
apartments = preprocessor.apartments


In [None]:
# Pull the remaining tables off the preprocessor.
apartments_test = preprocessor.apartments_test
buildings, buildings_test = preprocessor.buildings, preprocessor.buildings_test

# Work on private copies of the merged frames so the preprocessor's own
# attributes are not touched by later steps.
merged = preprocessor.merged.copy()
merged_test = preprocessor.merged_test.copy()
labels = merged["price"]

# Stack the two frames (merged first, then merged_test) under a fresh 0..n-1
# index so every later transform is applied to both at once.
merged_final = pd.concat((merged, merged_test), ignore_index=True)
print(f"The dataframe now has {len(merged_final)} entries")

In [None]:
# Create the project's Visualizer and report missing values on a copy of the
# stacked frame (a copy is passed, so `merged_final` itself is not handed over).
visualizer = Visualizer()
visualizer.list_missing(merged_final.copy())

In [None]:
# Two-stage clean-up, each stage fed a copy of the previous result:
# `general_removal` first, then `remove_NaNs`.
# NOTE(review): what each helper drops or imputes is defined in Preprocessor — confirm there.
general_removed = preprocessor.general_removal(merged_final.copy())
data = preprocessor.remove_NaNs(general_removed.copy())
# Re-run the missing-value report on the cleaned frame to check the result.
visualizer.list_missing(data)

In [None]:
# Show the Visualizer's price-correlation view for the cleaned frame, before any feature engineering.
visualizer.price_correlation(data)

In [None]:
# Ordered (transform, column) steps. Each step receives a fresh copy of the
# current frame and its return value becomes the input of the next step.
# NOTE(review): this cell rebinds `data`, so running it twice applies every
# step twice — re-run the cell that creates `data` first.
log_steps = (
    (preprocessor.logify, "price"),
    (preprocessor.logify, "area_total"),
    (preprocessor.logify, "area_living"),
    (preprocessor.remove_zero_values, "area_living"),
    (preprocessor.logify, "area_kitchen"),
)
for transform, column in log_steps:
    data = transform(data.copy(), column)

data.shape

In [None]:
# A row is treated as a coordinate outlier when its longitude exceeds 55 or
# its latitude is below 54.
# NOTE(review): the thresholds presumably bound the expected geographic area — confirm.
lon_too_large = data["longitude"].gt(55)
lat_too_small = data["latitude"].lt(54)
is_outlier = lon_too_large | lat_too_small

# Hand the flagged rows, together with a copy of the full frame, to the
# preprocessor's outlier fixer.
outliers = data.loc[is_outlier].copy()
removed_outliers = preprocessor.fix_latlon_outliers(data.copy(), outliers)

In [None]:
# Feature-engineering pipeline. Every step takes a copy of the current frame
# and rebinds `featured`, so the statement order below is significant.
# The general pattern is: build a combined feature with a `combine_*` helper,
# then drop the source columns with `remove_labels`.
# NOTE(review): the exact columns each `combine_*` helper adds are defined in
# Preprocessor and are not visible here.
featured = preprocessor.combine_area_rooms(removed_outliers.copy())

featured = preprocessor.remove_labels(featured.copy(), ["area_living", "area_kitchen", "rooms"])

featured = preprocessor.combine_baths(featured.copy())
featured = preprocessor.remove_labels(featured.copy(), labels=["bathrooms_private", "bathrooms_shared"])

# combine_windows is applied twice: once with its default and once with boolean=True.
featured = preprocessor.combine_windows(featured.copy())
featured = preprocessor.combine_windows(featured.copy(), boolean=True)
featured = preprocessor.combine_latlon(featured.copy())

# Identifier columns are dropped before looking at correlations.
featured = preprocessor.remove_labels(featured.copy(), ["id", "building_id"])

# Intermediate correlation check; the columns removed right after it are listed explicitly.
visualizer.price_correlation(featured)

featured = preprocessor.remove_labels(featured.copy(), ["garbage_chute", "phones", "has_windows", "windows_street", "windows_court"])

featured = preprocessor.combine_elevators(featured.copy())
featured = preprocessor.remove_labels(featured.copy(), ["elevator_passenger", "elevator_service"])
featured = preprocessor.redo_new(featured.copy())

featured = preprocessor.combine_new_constructed_distance(featured.copy())
featured = preprocessor.remove_labels(featured.copy(), ["constructed", "new"])

featured = preprocessor.combine_floor_stories(featured.copy())
featured = preprocessor.remove_labels(featured.copy(), ["floor", "stories"])

featured = preprocessor.combine_district_city_center(featured.copy())
# NOTE(review): disabled step. "distance_enter" looks like a typo for the
# "distance_center" column used below — fix before re-enabling, or delete this line.
#featured = preprocessor.remove_labels(featured.copy(),["district","distance_enter"])

featured = preprocessor.combine_area_total_city_center(featured.copy())
featured = preprocessor.logify(featured.copy(),"area_total_distance")

featured = preprocessor.logify(featured.copy(),"scaled_constructed")
featured = preprocessor.logify(featured.copy(),"distance_center")
featured = preprocessor.logify(featured.copy(),"lat")
featured = preprocessor.logify(featured.copy(),"long")
# NOTE(review): "distance_center" was already passed to logify three lines up,
# so this applies the transform to that column a second time — confirm this is intended.
featured = preprocessor.logify(featured.copy(),"distance_center")

# `categorical` is the frame consumed by the modelling cell; it is derived from a copy of `featured`.
categorical = preprocessor.split_categorical_features(featured.copy(), ["seller", "district","material", "has_elevator"])
# NOTE(review): the two displays below show `featured`, not `categorical`.
visualizer.price_correlation(featured)
featured.head()

In [None]:
# Separate the stacked frame back into labelled training rows and unlabelled
# test rows, hold out a small validation set, fit the model and write predictions.

# NOTE(review): 23285 is assumed to be the number of training rows that survive
# preprocessing — re-check it whenever a row-dropping step changes.
N_TRAIN_ROWS = 23285
VALIDATION_FRACTION = 0.001  # share of the training rows held out for validation
SPLIT_SEED = 42  # fixed so the hold-out split is identical on every run

train_data = categorical[:N_TRAIN_ROWS].copy()
labels = train_data["price"]
# Keyword form on purpose: pandas >= 2.0 no longer accepts `axis` positionally
# in DataFrame.drop, so `drop("price", 1, ...)` raises a TypeError there.
test_data = categorical[N_TRAIN_ROWS:].drop(columns="price")
print("Split data into train and test")

# The target column is dropped *before* splitting, so no in-place mutation of
# the frames returned by train_test_split is needed. Stratification uses the
# rounded target so the validation rows cover the same value bands as training.
# NOTE(review): with such a small hold-out fraction the validation score is
# computed on very few rows and will be noisy.
x_train, x_test, y_train, y_test = train_test_split(
    train_data.drop(columns="price"),
    labels,
    stratify=labels.round(),
    test_size=VALIDATION_FRACTION,
    random_state=SPLIT_SEED,
)
print("Validation data created and price dropped")
print("The number of features are now:", x_train.shape[1])

print("Fitting model...")
model = TestModel(x_train, y_train)
pipeline = model.fit()
# "price" went through preprocessor.logify earlier; expm1 is applied here to
# bring predictions and labels back before rounding.
# NOTE(review): this assumes logify is log1p — confirm in Preprocessor.
# `.to_numpy()` keeps the labels positional (y_test carries a shuffled index).
test_pred = np.round(np.expm1(np.asarray(model.predict(x_test))))
test_labels = np.round(np.expm1(y_test.to_numpy()))
print("Fitting complete")

res = pd.DataFrame({"actual": test_labels, "prediction": test_pred})
print("RMLSE: %s" % model.root_mean_squared_log_error(test_labels, test_pred))
res.to_csv("./split.csv", index=False)

# Predictions for the unlabelled test rows, handed over as a list as before.
pred = list(np.expm1(np.asarray(model.predict(test_data))))
model.save_predictions(pred)

In [None]:
# Search settings for the AutoML run, named so they are easy to tune.
automl_estimators = ["lgbm", "xgboost", "catboost", "rf", "extra_tree"]
automl_time_budget = 10000  # NOTE(review): presumably seconds — confirm in TestModel.autoMLfit

# A second TestModel instance, fitted through the AutoML path on the same
# training split as the baseline model.
automl_model = TestModel(x_train, y_train)
automl_model.autoMLfit(
    x_train,
    y_train,
    automl_estimators,
    time=automl_time_budget,
    ensemble=False,
)

In [None]:
# Score the AutoML model on the held-out validation rows, then predict the test set.
# expm1 is applied to predictions and labels before rounding, as in the baseline cell.
# `.to_numpy()` keeps the labels positional (y_test carries a shuffled index).
test_pred = np.round(np.expm1(np.asarray(automl_model.autoMLpredict(x_test))))
test_labels = np.round(np.expm1(y_test.to_numpy()))
print("Fitting complete")

res = pd.DataFrame({"actual": test_labels, "prediction": test_pred})
print("RMLSE: %s" % automl_model.root_mean_squared_log_error(test_labels, test_pred))
# NOTE(review): this is the same path the baseline cell writes to, so that file is overwritten.
res.to_csv("./split.csv", index=False)

# Use the AutoML predictor for the test rows as well: `automl_model` was only
# fitted through autoMLfit, and the validation score above comes from
# autoMLpredict, so calling the plain `predict` here would not use the model
# that was just evaluated.
# NOTE(review): confirm against TestModel that `predict` does not already delegate to the AutoML model.
pred = list(np.expm1(np.asarray(automl_model.autoMLpredict(test_data))))
automl_model.save_predictions(pred)
automl_model.autoML_print_best_model()