# Regression Party - Neural Network
Denna notebook täcker en analys av datan i hemnet_data_clean.csv samt en regression med hjälp av maskinlärning.

In [1]:
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neural_network import MLPRegressor

In [2]:
hemnet_data = pd.read_csv('hemnet_data_clean.csv')

In [3]:
def prepare_data(df: pd.DataFrame, drop_cols: list[str], target_col:str, adjust_price: bool = False):
    new_df = df.copy()
    
    # TODO: If adjust price_price == True: Adjust sell price by inflation
    new_df = new_df.dropna()
    new_df = new_df.drop(drop_cols, axis = 1)
    new_df = new_df.drop_duplicates()
    if 'coordinate' in new_df.columns:
        new_df[['latitude', 'longitude']] = new_df['coordinate'].str.strip('[]').str.split(', ', expand=True)
        new_df = new_df.drop('coordinate', axis = 1)

    X = new_df.drop(target_col, axis = 1)
    y = new_df[target_col]
    
    return X, y

In [4]:
X, y = prepare_data(hemnet_data,
                    ['Unnamed: 0',
                     'address',
                     'pourcentage_difference',
                     'commune', 'price_per_area',
                     'sale_date'
                     ],
                    target_col = 'final_price')

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=309)

In [6]:
reg = MLPRegressor(random_state=309)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [7]:
display(f'R2 score: {r2_score(y_test, y_pred)}')

'R2 score: 0.9339899378252882'

In [8]:
param_grid = {
    'hidden_layer_sizes': [(100,), (200,), (300,)],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.002, 0.005],
    'max_iter': [200, 100, 300],
    'shuffle': [True, False]
}
grid = GridSearchCV(estimator = MLPRegressor(random_state = 309),
                    param_grid = param_grid,
                    cv = 5,
                    n_jobs = -1,
                    scoring = 'r2')
grid.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [9]:
display(grid.best_params_)

{'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_iter': 100,
 'shuffle': True,
 'solver': 'lbfgs'}

In [10]:
best_model = grid.best_estimator_
best_y_pred = best_model.predict(X_test)

In [11]:
display(f'R2 score: {r2_score(y_test, best_y_pred)}')

'R2 score: 0.9337010228238671'