In [9]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import pickle
import os

# Zip codes whose per-city CSV files are loaded below.
cities = ['2800', '2820', '2830', '2840', '2850', '2900', '2920', '2930', '2942', '2950','3000','3460']
# Not referenced in this cell; kept because other cells may rely on them.
energy_classes = ['A2020', 'A2015', 'A2010', 'B', 'C', 'D', 'E', 'F', 'G']
types = ['Villa','Ejerlejlighed','Rækkehus','Villalejlighed']

# Load one CSV per zip code and tag every row with the zip code it came from.
dataframes = [
    pd.read_csv(f'./data/house_data/house_data_{city}.csv').assign(City=city)
    for city in cities
]
combined_df = pd.concat(dataframes, ignore_index=True)

# Drop rows whose housing type is "Landejendom".
combined_df = combined_df[combined_df['Type'] != 'Landejendom']

# Drop rows containing NaN in any column.
combined_df = combined_df.dropna()

# One-hot encode the categorical columns.
combined_df = pd.get_dummies(combined_df, columns=['City', 'Energy class', 'Type'])

# Standardize 'Size'.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into preprocessing. Consider fitting on
# the training rows only.
# NOTE(review): the fitted scaler is not persisted with the model below;
# anything that loads the pickle must reproduce this scaling itself.
scaler = StandardScaler()
combined_df['Size'] = scaler.fit_transform(combined_df[['Size']])

# Split into features (X) and target (y). 'Squaremeter price' is dropped along
# with the target; 'Address' and 'Url' are dropped as well.
X = combined_df.drop(['Price','Squaremeter price', 'Address','Url'], axis=1)
y = combined_df['Price']

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the model.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Evaluate on the held-out set.
predictions = linear_model.predict(X_test)
print('R^2 score: ', r2_score(y_test, predictions))
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
# taking the square root of the MSE works on every version.
print('RMSE: ', mean_squared_error(y_test, predictions) ** 0.5)

# Path the trained model is written to.
model_filename = 'linear_model.pkl'

# Save the model. Mode 'wb' truncates an existing file, so no explicit
# delete is needed beforehand.
with open(model_filename, 'wb') as file:
    pickle.dump(linear_model, file)




R^2 score:  0.7470507602966174
RMSE:  3078310.67965753
