# Regression Party - Neural Network
Denna notebook täcker en analys av datan i hemnet_data_clean.csv samt en regression med hjälp av maskininlärning.

In [64]:
from math import sqrt

import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor

In [22]:
hemnet_data = pd.read_csv('hemnet_data_clean.csv')

In [23]:
def prepare_data(df: pd.DataFrame, drop_cols: list[str], target_col:str, adjust_price: bool = False):
    new_df = df.copy()
    
    # TODO: If adjust price_price == True: Adjust sell price by inflation
    new_df = new_df.dropna()
    new_df = new_df.drop(drop_cols, axis = 1)
    new_df = new_df.drop_duplicates()
    if 'coordinate' in new_df.columns:
        new_df[['latitude', 'longitude']] = new_df['coordinate'].str.strip('[]').str.split(', ', expand=True)
        new_df = new_df.drop('coordinate', axis = 1)

    X = new_df.drop(target_col, axis = 1)
    y = new_df[target_col]
    
    return X, y

In [24]:
X, y = prepare_data(hemnet_data,
                    ['Unnamed: 0',
                     'address',
                     'pourcentage_difference',
                     'commune', 'price_per_area',
                     'sale_date'
                     ],
                    target_col = 'final_price')

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=309)

In [26]:
reg = MLPRegressor(random_state=309)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [27]:
display(f'R2 score: {r2_score(y_test, y_pred)}')

'R2 score: 0.9339899378252882'

In [44]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (200,), (300,)],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.0005, 0.001, 0.002, 0.005],
    'max_iter': [200, 300],
    'shuffle': [True, False]
}
grid = GridSearchCV(estimator = MLPRegressor(random_state = 309),
                    param_grid = param_grid,
                    cv = 5,
                    n_jobs = -1,
                    scoring = 'r2')
grid.fit(X_train, y_train)

In [60]:
display(grid.best_params_)

{'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.0005,
 'max_iter': 200,
 'shuffle': True,
 'solver': 'lbfgs'}

In [46]:
best_model = grid.best_estimator_
best_y_pred = best_model.predict(X_test)

In [61]:
print(f'MAE: {mean_absolute_error(y_test, best_y_pred)}')
print(f'sqrt(MSE): {sqrt(mean_squared_error(y_test, best_y_pred))}')
print(f'R2: {r2_score(y_test, best_y_pred)}')

MAE: 733193.3153801258
sqrt(MSE): 961709.2286110331
R2: 0.9334185196983958


In [93]:
fig = go.Figure((go.Scatter(x = y_test, y = y_test - best_y_pred, mode='markers'),
                 go.Scatter(x = y_test, y = [0 for _ in y_test], marker= dict(color = 'green')),
                 go.Scatter(x = y_test, y = [mean_absolute_error(y_test, best_y_pred) for _ in y_test], name = 'upper MAE', line = dict(color = 'red')),
                 go.Scatter(x = y_test, y = [-mean_absolute_error(y_test, best_y_pred) for _ in y_test], name = 'lower MAE', line = dict(color = 'red'))
                 ),
                 layout = go.Layout(title = '')
                )
fig.show()