In [32]:
# NOTE: every line in this cell is commented out, so the cell does nothing when run.
# # Combine the dataframes into one dataframe and remove NaN values

# import pandas as pd
# import matplotlib.pyplot as plt
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score

# cities = ['2800', '2820', '2830', '2840', '2850', '2900', '2920', '2930', '2942', '2950', '3000', '3460']

# dataframes = []

# for city in cities:
#     filename = f'./data/house_data/house_data_{city}.csv'
#     df = pd.read_csv(filename)
#     dataframes.append(df)

# combined_df = pd.concat(dataframes, ignore_index=True)    

# # Show the whole dataframe without limits
# pd.set_option("display.max_rows", None)  # Show all rows

# # Remove rows with NaN values
# combined_df = combined_df.dropna()

# # Reset the index of the updated dataframe
# combined_df = combined_df.reset_index(drop=True)

# #combined_df

In [72]:
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import pickle

# Train a linear regression that predicts house prices from location, size,
# zip code, energy class and housing type, save it to disk, and run a
# single-row sanity prediction.

cities = ['2800', '2820', '2830', '2840', '2850', '2900', '2920', '2930', '2942', '2950','3000','3460']
energy_classes = ['A2020', 'A2015', 'A2010', 'B', 'C', 'D', 'E', 'F', 'G']
types = ['Villa','Ejerlejlighed','Rækkehus','Villalejlighed']

# Load one CSV per zip code and tag every row with the zip code it came from.
dataframes = []

for city in cities:
    filename = f'./data/house_data/house_data_{city}.csv'
    df = pd.read_csv(filename)
    df['City'] = city  # Add a 'City' column with the zip code
    dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True)

# Remove rows with the housing type "Landejendom"
combined_df = combined_df[combined_df['Type'] != 'Landejendom']

# Remove rows with NaN values
combined_df = combined_df.dropna()

# One-hot encode the categorical columns (produces City_*, Energy class_*, Type_* columns).
combined_df = pd.get_dummies(combined_df, columns=['City', 'Energy class', 'Type'])

# Standardise 'Size' in place.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split and is not saved together with the model, so the pickled model can only
# be fed sizes that were scaled with this exact scaler instance. Consider
# fitting on the training split only and persisting the scaler as well.
scaler = StandardScaler()
combined_df['Size'] = scaler.fit_transform(combined_df[['Size']])

# Split the data into features (X) and target (y)
X = combined_df.drop(['Price','Squaremeter price', 'Address','Url'], axis=1)
y = combined_df['Price']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Create and train the model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Test the model
predictions = linear_model.predict(X_test)
print('R^2 score: ', r2_score(y_test, predictions))
# RMSE is computed as sqrt(MSE); the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6, while this form works on every version.
print('RMSE: ', mean_squared_error(y_test, predictions) ** 0.5)

# Save the model
with open('linear_model.pkl', 'wb') as file:
    pickle.dump(linear_model, file)

# Positional row number (0 = first row left after filtering) of the dataset row
# used for the sanity prediction below. Position is used instead of the index
# label because rows were removed above without resetting the index, so a
# given label is not guaranteed to exist.
linenumber = 0

# Take the feature row straight from X so it has exactly the columns, in
# exactly the order, the model was trained on. This replaces a hand-written
# dictionary in which the 'Energy class_A2010' and 'Energy class_A2020' values
# were swapped, and which raised KeyError whenever a dummy column was absent.
new_df = X.iloc[[linenumber]]
new_data = new_df.iloc[0].to_dict()

# Predict the price for that row; index the result explicitly because
# converting a 1-element array to int is deprecated in NumPy >= 1.25.
# NOTE(review): the row comes from the same dataset the model was fitted on,
# so this is a smoke test rather than an out-of-sample prediction.
prediction = int(linear_model.predict(new_df)[0])

print("Forudset pris:", prediction)
print(new_data)


combined_df


R^2 score:  0.6552805218638091
RMSE:  4237123.57182247
Forudset pris: 658928
{'X': 55.770647, 'Y': 12.508193, 'Size': -1.1497078841314659, 'City_2800': True, 'City_2820': False, 'City_2830': False, 'City_2840': False, 'City_2850': False, 'City_2900': False, 'City_2920': False, 'City_2930': False, 'City_2942': False, 'City_2950': False, 'City_3000': False, 'City_3460': False, 'Energy class_A2010': False, 'Energy class_A2015': False, 'Energy class_A2020': False, 'Energy class_B': False, 'Energy class_C': False, 'Energy class_D': False, 'Energy class_E': True, 'Energy class_F': False, 'Energy class_G': False, 'Type_Ejerlejlighed': True, 'Type_Rækkehus': False, 'Type_Villa': False, 'Type_Villalejlighed': False}
X: -38360724.323006414
Y: 18448763.61336527
Size: 3009857.716073675
City_2800: -2648399.463881613
City_2820: -3046489.1538511915
City_2830: -1180310.1057092939
City_2840: 515977.4642876531
City_2850: -2722607.5205603377
City_2900: -665981.6495856005
City_2920: -315164.352560666
City

Unnamed: 0,Address,X,Y,Price,Size,Squaremeter price,Url,City_2800,City_2820,City_2830,...,Energy class_B,Energy class_C,Energy class_D,Energy class_E,Energy class_F,Energy class_G,Type_Ejerlejlighed,Type_Rækkehus,Type_Villa,Type_Villalejlighed
0,"Nørgaardsvej 27, 2. th",55.770647,12.508193,1595000,-1.149708,36250,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,False,False,True,False,False,True,False,False,False
1,Gammelmosevej 218,55.756103,12.480264,4495000,-0.352875,39778,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,True,False,False,False,False,False,True,False,False
2,Danmarksvej 43A,55.794678,12.517936,7695000,0.443957,42280,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,False,True,False,False,False,False,False,True,False
3,Stengårdsvænge 103,55.759154,12.482144,5295000,-0.479907,51911,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,True,False,False,False,False,False,True,False,False
4,Buddingevej 110,55.758719,12.494712,8750000,0.513247,46542,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,False,True,False,False,False,False,False,True,False
5,Gammel Lundtoftevej 13,55.775416,12.501682,3995000,-0.757066,51217,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,False,False,True,False,False,False,False,True,False
6,"Hollandsvej 13, st. th",55.764643,12.50608,2150000,-1.022677,39090,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,False,True,False,False,False,True,False,False,False
7,"Lyngby Hovedgade 11B, 3. th",55.772716,12.501409,2595000,-0.988032,44741,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,False,True,False,False,False,True,False,False,False
8,Islandsvej 4A,55.796196,12.508581,6195000,-0.606938,68076,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,True,False,False,False,False,False,True,False,False
9,Vintappervej 16,55.763259,12.51509,5500000,0.062863,36912,https://www.dingeo.dk/adresse/2800-kongens%20l...,True,False,False,...,False,True,False,False,False,False,False,False,True,False
