# Data Cleaning

## Relevante pakker

In [86]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import geopandas as gpd
pd.options.display.max_columns = None
# pd.set_option('display.max_rows', None)

## Indlæser data

In [203]:
bolig_df = pd.read_csv('boliger_salg.csv', sep = ';', encoding = 'utf-8')
print(len(bolig_df))

55265


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


## Udvælger relevante variable

In [226]:
bolig_df.columns
bolig_df_selected = bolig_df[['id', 'latitude', 'longitude', 'propertyType', 'energyClass', 'price',
                             'rooms', 'size', 'lotSize', 'floor', 'buildYear', 'city', 'isForeclosure',
                             'municipality', 'zipCode', 'street', 'squaremeterPrice', 'area', 
                             'exp', 'basementSize', 'businessArea', 'cleanStreet', 'createdDate']]
print(len(bolig_df))

55265


## Udvælger observationer
1. Fjerner andelsboliger, fritidsboliger og fritidsgrunde
2. Vi har identificeret nogle observationer med meget lav pris, som viser sig at være timeshare, grunde i offentligt udbud mm. Vi fjerner derfor alle observationer med en pris på under 29.500
3. Fjerner dubletter


In [227]:
# 1. Fjerner andelsboliger, fritidshuse og fritidsgrunde
bolig_df_selected = bolig_df_selected[~bolig_df_selected['propertyType'].isin([4,5,8])].reset_index(drop = True)
bolig_df_selected
print(len(bolig_df[bolig_df['propertyType']==5]))
print(len(bolig_df[bolig_df['propertyType']==4]))
print(len(bolig_df[bolig_df['propertyType']==8]))
print(len(bolig_df_selected))
bolig_df_selected[bolig_df_selected['price']==10000]

723
6526
727
47289


Unnamed: 0,id,latitude,longitude,propertyType,energyClass,price,rooms,size,lotSize,floor,buildYear,city,isForeclosure,municipality,zipCode,street,squaremeterPrice,area,exp,basementSize,businessArea,cleanStreet,createdDate
13118,2065524,56.89526,9.22525,7,-,10000,0.0,0,703,,0,Ranum,False,820,9681,Nygade 10,0.0,11,0,0,,Nygade,2024-01-13T00:57:22.817Z
19117,2070806,56.8563,9.2659,7,-,10000,0.0,0,2667,,0,Løgstør,False,820,9670,"Bjørnsholmvej 177, Overlade",0.0,11,0,0,,Bjørnsholmvej,2024-02-06T00:57:01.653Z
20421,2065559,56.87703,9.28359,7,-,10000,0.0,0,756,,0,Løgstør,False,820,9670,"Munksjørupvej 64, Overlade",0.0,11,0,0,,Munksjørupvej,2024-01-13T00:57:27.373Z
22166,2065570,56.6904,9.33111,7,-,10000,0.0,0,398,,0,Gedsted,False,820,9631,Vesterbro 9,0.0,11,0,0,,Vesterbro,2024-01-13T00:57:28.497Z
23656,2065563,56.85478,9.26264,7,-,10000,0.0,0,1961,,0,Løgstør,False,820,9670,"Mosevej 30, Overlade",0.0,11,0,0,,Mosevej,2024-01-13T00:57:27.853Z
25787,2065561,56.89874,9.21992,7,-,10000,0.0,0,2106,,0,Ranum,False,820,9681,Stadionvej 4,0.0,11,0,0,,Stadionvej,2024-01-13T00:57:27.793Z
25788,2065571,56.67375,9.39907,7,-,10000,0.0,0,1266,,0,Aalestrup,False,820,9620,"Korsvejen 77, Fjelsø",0.0,11,0,0,,Korsvejen,2024-01-13T00:57:29.343Z
26697,2065569,56.71373,9.35613,7,-,10000,0.0,0,1143,,0,Gedsted,False,820,9631,"Tolshøj 20, Vesterbølle",0.0,11,0,0,,Tolshøj,2024-01-13T00:57:28.437Z
27338,2065560,56.90383,9.28472,7,-,10000,0.0,0,425,,0,Løgstør,False,820,9670,"Kirkebakken 7, Vilsted",0.0,11,0,0,,Kirkebakken,2024-01-13T00:57:27.753Z
31024,2065562,56.8967,9.22516,7,-,10000,0.0,0,614,,0,Ranum,False,820,9681,Vestergade 48,0.0,11,0,0,,Vestergade,2024-01-13T00:57:27.823Z


In [228]:
# 2. Fjerner boliger med pris under 29500
print(len(bolig_df_selected[bolig_df_selected['price']<=29500]))
bolig_df_selected = bolig_df_selected[bolig_df_selected['price']>29500]
print(len(bolig_df_selected))

19
47270


In [229]:
# Fjerner ejendomme med en en størrelse under 10 kvm som ikke er grunde, da det ligner erhvervsejendomme
# Filter the dataframe to remove rows where size is less than 10 and propertyType is not 7 or 8
print(len(bolig_df_selected[(bolig_df_selected['size']<10) & (~bolig_df_selected['propertyType'].isin([7]))]))
bolig_df_selected = bolig_df_selected.loc[
    (bolig_df_selected['size'] >= 10) | 
    (bolig_df_selected['propertyType'].isin([7, 8]))
]
print(len(bolig_df_selected))


40
47230


In [230]:
# Fjerner de to garager jeg har identificeret
# Løvholmen 14, st.. 10.
# Store Kongensgade 90, st.. 4.
bolig_df_selected = bolig_df_selected.loc[
    (~bolig_df_selected['street'].isin(["Løvholmen 14, st.. 10.", "Store Kongensgade 90, st.. 4."]))
]
print(len(bolig_df_selected))



47228


## Data Cleaning
Data cleaning refers to the process of identifying and correcting errors, inconsistencies, and missing values in a dataset. It is the process of making sure that the data is accurate and consistent before it is used for analysis. This includes tasks such as dealing with missing values, removing duplicate data, and correcting errors in the data. Data cleaning is an essential step in the process of working with data because it ensures that the data is of high quality and can be used to make accurate and reliable conclusions.

Nedenfor cleaner vi data ved at fjerne eventuelle dubletter og korrigere fejlværdier i variablene

## Dubletter 

### Tjekker for dubletter

In [231]:
# Tjekker for dubletter i alle kolonner
print("Dubletter:" + str(len(bolig_df[bolig_df.duplicated()])))

# Step 1: Group by 'street' and 'zipCode' and count occurrences
grouped_df = bolig_df_selected.groupby(['street', 'zipCode', 'propertyType']).size().reset_index(name='count')

# Step 2: Filter for groups where there is more than one occurrence
duplicates_df = grouped_df[grouped_df['count'] > 1]

# Step 3: Merge with original DataFrame to get all the original columns for the filtered rows
result_df = bolig_df_selected.merge(duplicates_df[['street', 'zipCode', 'propertyType']], 
                           on=['street', 'zipCode', 'propertyType'], how='inner')

pd.set_option('display.max_rows', 20)
result_df
result_df.to_csv('dubletter.csv', sep=';', index=False, encoding='utf-8')

# Der er nogle dubletter:
# 1. Villaer: Ligner fejl, nogle boliger ligger hos to mæglere. Hvis jeg kun kan finde den et sted 
# Er det den ældste som passer på det der ligger hos boliga, der er også nogle tilfælde, 
# Hvor nogle variable mangler på det nyeste men ikke den ældste række for ejendomme

# 2. Rækkehuse: Ligner også fejl. Nogle boliger ligger hos to mæglere. 
# Der er dog også et tilfælde af projektbolig, som ligger med samme adresse men to forskellige boliger
# i to forskellige størrelser. (kan evt. håndteres ved også at betinge på size, når dubletter fjernes)

# 3. Lejligheder: ligner fejl, der er nogle som ligger hos to mæglere. Igen er der en ejendom som ligger der
# 3 gange med samme adresse, men det er rækkehus eller noget og der er tre forskellige størrelser
# Kan håndteres ved at betinge på size, når dubletter fjernes

# 4. Fritidsejendom: Ligner også fejl. Igen nogle ejendomme på samme adresse med forskellig areal. 
# Kan håndteres ved at betinge på size når der fjernes dubletter
# I de tilfælde hvor jeg kun kan finde bolig hos en mægler ligner det at den ældste række passer.

# 6. Landejendom: Jeg tror mange af dem er reelle. Men det er svært. Det ligner at det er sælges en 
# ejendom og jord (dvs. flere matrikler). Jeg tror altid at de vil blive solgt sammen. 
# Jeg er i tvivl om den dyreste altid vil være sammenlignen af de to eller hvordan?
# Jeg tror vi skal beholde begge, hvilket kan gøres ved at betinge på lotSize, når vi 
# fjerner dubletter. Der er enkelte som ligger med samme lotSize, jeg tror det er ægte dubletter.

# 7. Helårsgrunde: Ligner igen fejl. Ligner at den ældste række har de rigtige informationer.

# 8. Fritidsgrunde: Ligner igen fejl. Ligner at den ældste række har de rigtige informationer.

Dubletter:0


### Fjerner dubletter

In [232]:
# Sort by 'createdDate' in ascending order to ensure the oldest observations are kept
bolig_df_selected = bolig_df_selected.sort_values(by='createdDate')

# Split the dataframe based on PropertyType
df_type1 = bolig_df_selected[bolig_df_selected['propertyType'] == 1]
df_type2 = bolig_df_selected[bolig_df_selected['propertyType'] == 2]
df_type3 = bolig_df_selected[bolig_df_selected['propertyType'] == 3]
# df_type4 = bolig_df_selected[bolig_df_selected['propertyType'] == 4]
df_type6 = bolig_df_selected[bolig_df_selected['propertyType'] == 6]
df_type7 = bolig_df_selected[bolig_df_selected['propertyType'] == 7]
# df_type8 = bolig_df_selected[bolig_df_selected['propertyType'] == 8]

In [233]:
# Drop duplicates for PropertyType 1
df_type1 = df_type1.drop_duplicates(subset=['street', 'zipCode'])

# Drop duplicates for PropertyType 2
df_type2 = df_type2.drop_duplicates(subset=['street', 'zipCode', 'size'])

# Drop duplicates for PropertyType 3
df_type3 = df_type3.drop_duplicates(subset=['street', 'zipCode', 'size'])

# Drop duplicates for PropertyType 6
df_type6 = df_type6.drop_duplicates(subset=['street', 'zipCode', 'lotSize'])

# Drop duplicates for PropertyType 7
df_type7 = df_type7.drop_duplicates(subset=['street', 'zipCode'])


In [234]:
# Concatenate the filtered dataframes back together
bolig_df_selected2 = pd.concat([df_type1, df_type2, df_type3, df_type6, df_type7])

# Reset index if needed
bolig_df_selected2.reset_index(drop=True, inplace=True)

print(len(bolig_df_selected2))

46859


In [189]:
bolig_df_selected2[bolig_df_selected2['street']=="Danehøje 16A"]
bolig_df_selected2[bolig_df_selected2['street']=="Langsand 218"]

Unnamed: 0,id,latitude,longitude,propertyType,priceChangePercentTotal,energyClass,openHouse,price,selfsale,rooms,size,lotSize,floor,buildYear,city,isForeclosure,isActive,municipality,zipCode,street,squaremeterPrice,area,daysForSale,createdDate,isPremiumAgent,images,net,exp,basementSize,inWatchlist,views,agentRegId,domainId,guid,agentDisplayName,groupKey,downPayment,itemType,dawaId,projectSaleUrl,additionalBuildings,lastSeen,businessArea,nonPremiumDiscrete,bfeNr,ouId,ouAddress,onTheWay,cleanStreet,otwAddress,dsAddress,boligaPlus,showLogo,randomTypeHuse


## EnergyClass
Sørger for at alt er med småt og retter mærkelige energiklasser

In [235]:
# Retter alt til stort
bolig_df_selected2['energyClass']= bolig_df_selected2['energyClass'].str.upper()

In [240]:
# Ændre mærkelige energimærker
# Define the mapping of old values to new values
mapping = {
    'M': 'A2',
    'K': 'A10',
    'J': 'A15',
    'I': 'A20'
}

# Replace the values in the 'energyClass' column
bolig_df_selected2['energyClass'] = bolig_df_selected2['energyClass'].replace(mapping)
bolig_df_selected2[bolig_df_selected2['energyClass']=='A20']

Unnamed: 0,id,latitude,longitude,propertyType,energyClass,price,rooms,size,lotSize,floor,buildYear,city,isForeclosure,municipality,zipCode,street,squaremeterPrice,area,exp,basementSize,businessArea,cleanStreet,createdDate
230,1709351,57.16998,9.74983,1,A20,3850000,5.0,182,924,,2021,Aabybro,False,849,9440,Solskins Alle 9,21153.0,11,2443,0,,Solskins Alle,2020-09-28T23:58:22.123Z
355,1773472,57.45257,9.98009,1,A20,9000000,7.0,374,1898,,2012,Hjørring,False,860,9800,Søndervang 5,24064.0,11,4765,114,0.0,Søndervang,2021-04-20T00:17:27.260Z
585,1837264,56.57827,9.00978,1,A20,3365000,5.0,160,1203,,2023,Skive,False,779,7800,"Vindevej 103, Vinde",21031.0,9,2157,0,,Vindevej,2021-10-15T00:05:14.840Z
596,1839481,56.57799,9.00993,1,A20,3695000,5.0,166,1203,,2023,Skive,False,779,7800,"Vindevej 101, Vinde",22259.0,9,2517,0,,Vindevej,2021-10-23T00:00:09.130Z
714,1860360,55.45981,9.10563,1,A20,2995000,4.0,129,339,,2024,Vejen,False,575,6600,"Kongstedgårds Alle 16F, Askov",23217.0,8,2425,0,0.0,Kongstedgårds Alle,2022-01-27T01:10:44.000Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42622,2120539,55.80697,8.68127,6,A20,4498000,7.0,180,11000,,2008,Ølgod,False,573,6870,Agersnapvej 24,24988.0,8,2435,0,0.0,Agersnapvej,2024-06-27T00:06:22.563Z
42642,2121141,55.68951,11.79062,6,A20,17995000,4.0,354,84281,,1908,Holbæk,False,316,4300,"Bredetvedvej 23, Bredetved",50833.0,6,2969,0,0.0,Bredetvedvej,2024-06-28T22:05:25.000Z
42702,2123546,55.92217,12.17867,6,A20,39000000,10.0,383,233211,,2016,Skævinge,False,219,3320,Lindebjergvej 8,101827.0,3,11062,181,,Lindebjergvej,2024-07-05T22:04:31.310Z
42790,2126184,55.55734,12.11298,6,A20,10995000,8.0,162,97916,,1877,Havdrup,False,265,4622,Bulbrovej 1,67870.0,5,4383,0,0.0,Bulbrovej,2024-07-15T22:35:01.670Z


### Rooms
Sætter boliger med 0 værelser til missing, hvis det ikke er en grund

In [194]:
# Set 'rooms' to NaN where 'rooms' == 0 and 'propertyType' is not 7 or 8
bolig_df_selected2.loc[(bolig_df_selected2['rooms'] == 0) & 
                      (~bolig_df_selected2['propertyType'].isin([7, 8])), 'rooms'] = np.nan

### Size
Overvej:
1. Skal vi fjerne ejendomme med size under 10 kvm, som ikke er grunda, da det ligner erhvervsejendomme?
2. Skal vi fjerne de to garager jeg har identificeret?

### lotSize
Overvej:
1. Sæt lotSize til missing, hvis den er 0 for lejligheder?
2. Skal vi gøre noget ved høj lotSize for fritidsejendomme?

In [197]:
# Sætter lotSize til missing, hvis den er 0 eller 1  og det ikke er en lejlighed
bolig_df_selected2.loc[
    (bolig_df_selected2['lotSize'].isin([0,1])) & 
    (bolig_df_selected2['propertyType'] != 3), 'lotSize'
] = np.nan

bolig_df_selected2[bolig_df_selected2['propertyType']!=3].sort_values('lotSize')

Unnamed: 0,id,latitude,longitude,propertyType,priceChangePercentTotal,energyClass,openHouse,price,selfsale,rooms,size,lotSize,floor,buildYear,city,isForeclosure,isActive,municipality,zipCode,street,squaremeterPrice,area,daysForSale,createdDate,isPremiumAgent,images,net,exp,basementSize,inWatchlist,views,agentRegId,domainId,guid,agentDisplayName,groupKey,downPayment,itemType,dawaId,projectSaleUrl,additionalBuildings,lastSeen,businessArea,nonPremiumDiscrete,bfeNr,ouId,ouAddress,onTheWay,cleanStreet,otwAddress,dsAddress,boligaPlus,showLogo,randomTypeHuse
31127,2059729,55.26901,9.89621,2,0,C,,695000,False,3.0,60,43.0,,1830,Assens,False,True,420,5610,Kindhestegade 16,11583.0,7,252,2023-12-08T09:39:28.550Z,False,,0,842,0,False,203,575,3,25BA10E3-C167-40AD-8755-7A221CA19D6D,,,35000,0,0A3F50B2-48C4-32B8-E044-0003BA298018,,,2024-08-15T23:50:04.233Z,0.0,False,5440156.0,1283406553,kindhestegade-16-5610-assens,False,Kindhestegade,,,False,False,"{'leisureHouses': [], 'houses': []}"
30651,1881317,55.26896,9.89483,2,-8,D,2024-08-17T10:00:00.000Z,545000,False,3.0,63,45.0,,1800,Assens,False,True,420,5610,Adelgade 15,8650.0,7,864,2022-04-05T00:05:47.383Z,False,,0,1212,11,False,966,484,7,49EB2FF4-A3A4-4070-9347-4AED3B7A73D0,,,30000,0,0A3F50B2-406E-32B8-E044-0003BA298018,,,2024-08-15T23:53:53.223Z,0.0,False,5440235.0,1964460645,adelgade-15-5610-assens,False,Adelgade,,,False,False,"{'leisureHouses': [], 'houses': []}"
31442,2078184,56.46053,10.03292,2,0,E,,1395000,False,3.0,95,45.0,,1900,Randers C,False,True,730,8900,Von Hattenstræde 5,14684.0,10,165,2024-03-04T10:31:57.990Z,True,"[{'id': 2078184, 'date': '2024-08-16T15:49:09....",0,1686,0,False,299,901,9,3936EF14-9E5A-49E0-A585-42179D530A13,"Nybolig Randers, Bjørn & Ankersen",,70000,0,0A3F50C1-5185-32B8-E044-0003BA298018,,,2024-08-15T23:41:07.970Z,0.0,False,5608051.0,1085138353,von-hattenstraede-5-8900-randers-c,False,Von Hattenstræde,,,False,False,"{'leisureHouses': [], 'houses': []}"
32587,2113016,55.86315,9.84703,2,-6,C,2024-08-17T10:00:00.000Z,1495000,False,3.0,74,47.0,,1880,Horsens,False,True,615,8700,Grønnegade 21,20202.0,10,68,2024-06-09T00:29:33.000Z,False,,0,1251,0,False,77,695,7,0D000E49-0B26-4481-BA9A-DB60E7C49E33,,,75000,0,,,,2024-08-15T23:38:14.527Z,0.0,False,5638835.0,36972962,groennegade-21-8700-horsens,False,Grønnegade,,,False,False,"{'leisureHouses': [], 'houses': []}"
32681,2115321,56.08740,8.24349,2,0,C,,995000,False,2.0,41,47.0,,1760,Ringkøbing,False,True,760,6950,Ø Strandgade 43,24268.0,9,62,2024-06-15T00:18:40.340Z,False,"[{'id': 2115321, 'date': '2024-08-16T16:12:10....",0,1223,0,False,78,845,7,464BD44D-B3CB-43B0-94F3-7B43EE80B496,,,50000,0,,,,2024-08-15T23:37:34.187Z,0.0,False,5724866.0,1994160652,oe-strandgade-43-6950-ringkoebing,False,Ø Strandgade,,,False,False,"{'leisureHouses': [], 'houses': []}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45978,2066606,55.24078,10.45256,7,0,-,,925000,False,0.0,0,,,0,Ringe,False,True,430,5750,Frøvej 4,0.0,7,211,2024-01-18T10:37:29.263Z,False,"[{'id': 2066606, 'date': '2024-08-16T16:18:39....",0,524,0,False,49,25895,9,,,,50000,0,C32F1B0F-D8CF-4C26-AC5E-12DEBD261542,,,2024-08-15T23:42:47.383Z,,False,100548330.0,348404144,froevej-4-5750-ringe,False,Frøvej,,,False,False,"{'leisureHouses': [], 'houses': []}"
45979,2066717,55.24106,10.45346,7,0,-,,775000,False,0.0,0,,,0,Ringe,False,True,430,5750,Salamandervej 15,0.0,7,211,2024-01-18T14:30:18.193Z,False,"[{'id': 2066717, 'date': '2024-08-16T16:19:07....",0,439,0,False,47,25895,9,,,,40000,0,10C17E62-4F83-47F9-AC79-79A203BB9F6B,,,2024-08-15T23:42:47.450Z,,False,100548346.0,871798393,salamandervej-15-5750-ringe,False,Salamandervej,,,False,False,"{'leisureHouses': [], 'houses': []}"
45980,2066718,55.24084,10.45344,7,0,-,,775000,False,0.0,0,,,0,Ringe,False,True,430,5750,Salamandervej 13,0.0,7,211,2024-01-18T14:30:24.917Z,False,"[{'id': 2066718, 'date': '2024-08-16T16:18:11....",0,439,0,False,51,25895,9,,,,40000,0,91AA8A61-27BC-4EB1-9016-9E06D2C74356,,,2024-08-15T23:42:47.433Z,,False,100548347.0,680880300,salamandervej-13-5750-ringe,False,Salamandervej,,,False,False,"{'leisureHouses': [], 'houses': []}"
45981,2066722,55.24086,10.45215,7,0,-,,925000,False,0.0,0,,,0,Ringe,False,True,430,5750,"Frøvej 6, Rynkeby",0.0,7,211,2024-01-18T14:30:41.117Z,False,"[{'id': 2066722, 'date': '2024-08-16T16:23:14....",0,524,0,False,31,25895,9,,,,50000,0,7762D26E-3518-45B7-B41B-F88595E5F80D,,,2024-08-15T23:42:47.403Z,,False,100548331.0,1675292128,froevej-6-5750-ringe,False,Frøvej,,,False,False,"{'leisureHouses': [], 'houses': []}"


### buildYear

In [198]:
# Sætter byggeår til missing, hvis den er fra før 1575
bolig_df_selected2.loc[(bolig_df_selected2['buildYear'] < 1575), 'buildYear'] = np.nan
bolig_df_selected2[(bolig_df_selected2['buildYear'] < 1575)]

Unnamed: 0,id,latitude,longitude,propertyType,priceChangePercentTotal,energyClass,openHouse,price,selfsale,rooms,size,lotSize,floor,buildYear,city,isForeclosure,isActive,municipality,zipCode,street,squaremeterPrice,area,daysForSale,createdDate,isPremiumAgent,images,net,exp,basementSize,inWatchlist,views,agentRegId,domainId,guid,agentDisplayName,groupKey,downPayment,itemType,dawaId,projectSaleUrl,additionalBuildings,lastSeen,businessArea,nonPremiumDiscrete,bfeNr,ouId,ouAddress,onTheWay,cleanStreet,otwAddress,dsAddress,boligaPlus,showLogo,randomTypeHuse


In [199]:
# Sætter byggeår til missing, hvis det er en grund
bolig_df_selected2.loc[(bolig_df_selected2['propertyType'].isin([7,8]), 'buildYear')] = np.nan
bolig_df_selected2[(~bolig_df_selected2['buildYear'].isna()) & (bolig_df_selected2['propertyType'].isin([7,8]))]

Unnamed: 0,id,latitude,longitude,propertyType,priceChangePercentTotal,energyClass,openHouse,price,selfsale,rooms,size,lotSize,floor,buildYear,city,isForeclosure,isActive,municipality,zipCode,street,squaremeterPrice,area,daysForSale,createdDate,isPremiumAgent,images,net,exp,basementSize,inWatchlist,views,agentRegId,domainId,guid,agentDisplayName,groupKey,downPayment,itemType,dawaId,projectSaleUrl,additionalBuildings,lastSeen,businessArea,nonPremiumDiscrete,bfeNr,ouId,ouAddress,onTheWay,cleanStreet,otwAddress,dsAddress,boligaPlus,showLogo,randomTypeHuse


### PropertyType
1. Lægger 10, 11 og 12 sammen til andet

In [241]:
# Replace propertyType 11 and 12 with 10
bolig_df_selected2['propertyType'] = bolig_df_selected2['propertyType'].replace([11, 12], 10)


### Municipality
1. Sætter kommune til missing, hvis den er 0 eller -1

In [201]:
bolig_df_selected2.loc[(bolig_df_selected2['municipality'].isin([0,-1])),
                       'municipality'] = np.nan
bolig_df_selected2.groupby('municipality').size()


municipality
101.0    1681
147.0     239
151.0     140
153.0      97
155.0     127
         ... 
840.0     497
846.0     663
849.0     696
851.0    1945
860.0    1032
Length: 98, dtype: int64

### Coordinates
Retter missing koordinater

In [258]:
# Sætter koordinater med 0 eller -1 til missing
import numpy as np

# Set 'longitude' to NaN where it is equal to -1 or 0
bolig_df_selected2.loc[
    (bolig_df_selected2['longitude'] == -1) | (bolig_df_selected2['longitude'] == 0), 
    'longitude'
] = np.nan

# Set 'latitude' to NaN where it is equal to -1 or 0
bolig_df_selected2.loc[
    (bolig_df_selected2['latitude'] == -1) | (bolig_df_selected2['latitude'] == 0), 
    'latitude'
] = np.nan

# Eksporterer data
Eksporterer cleaned data

In [259]:
bolig_df_selected2.to_csv('boliger_cleaned.csv', sep=';', index=False, encoding='utf-8')