## Cleaning of the imported data for the real estate EDA project

In [163]:
# import all libraries which can be helpful down the road
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import altair as alt
import missingno as msno

##### Data needs to be imported to work with it. Then have a first look at the dataframe and the data itself.

In [164]:
# load the raw CSV file into a dataframe and preview the first five rows to get an impression
raw_csv_path = 'data/eda.csv'
df = pd.read_csv(raw_csv_path)
df.head(5)

Unnamed: 0,id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price,house_id,id.1
0,7129300520,3.0,1.0,1180.0,5650.0,1.0,,0.0,3,7,...,0.0,98178,47.5112,-122.257,1340.0,5650.0,2014-10-13,221900.0,7129300520,1
1,6414100192,3.0,2.25,2570.0,7242.0,2.0,0.0,0.0,3,7,...,19910.0,98125,47.721,-122.319,1690.0,7639.0,2014-12-09,538000.0,6414100192,2
2,5631500400,2.0,1.0,770.0,10000.0,1.0,0.0,0.0,3,6,...,,98028,47.7379,-122.233,2720.0,8062.0,2015-02-25,180000.0,5631500400,3
3,2487200875,4.0,3.0,1960.0,5000.0,1.0,0.0,0.0,5,7,...,0.0,98136,47.5208,-122.393,1360.0,5000.0,2014-12-09,604000.0,2487200875,4
4,1954400510,3.0,2.0,1680.0,8080.0,1.0,0.0,0.0,3,8,...,0.0,98074,47.6168,-122.045,1800.0,7503.0,2015-02-18,510000.0,1954400510,5


In [165]:
# check the dtype of every column and, via the non-null counts, whether all columns possess values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   bedrooms       21597 non-null  float64
 2   bathrooms      21597 non-null  float64
 3   sqft_living    21597 non-null  float64
 4   sqft_lot       21597 non-null  float64
 5   floors         21597 non-null  float64
 6   waterfront     19206 non-null  float64
 7   view           21534 non-null  float64
 8   condition      21597 non-null  int64  
 9   grade          21597 non-null  int64  
 10  sqft_above     21597 non-null  float64
 11  sqft_basement  21145 non-null  float64
 12  yr_built       21597 non-null  int64  
 13  yr_renovated   17749 non-null  float64
 14  zipcode        21597 non-null  int64  
 15  lat            21597 non-null  float64
 16  long           21597 non-null  float64
 17  sqft_living15  21597 non-null  float64
 18  sqft_l

##### Some redundancies can be eliminated and some columns can be renamed for a clearer description.

In [166]:
# house_id is the same as id, therefore the column can be dropped.
# The result is reassigned instead of using inplace=True, and errors='ignore' keeps the cell
# re-runnable: once the column is gone, running the cell again is a no-op instead of a KeyError.
df = df.drop(columns=['house_id'], errors='ignore')

In [167]:
# id.1 is a bad column name and needs a better description anyway -> transaction_id.
# id is renamed to prop(erty)_id to make its meaning more obvious.
# The renamed frame is reassigned instead of mutating df in place; labels that are no longer
# present are skipped by rename, so the cell can safely be run again.
df = df.rename(columns={"id.1": "transaction_id", "id": "prop_id"})

In [168]:
# just have a look at the data: summary statistics (count, mean, std, min, quartiles, max) of the columns
df.describe()

Unnamed: 0,prop_id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price,transaction_id
count,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,19206.0,21534.0,21597.0,21597.0,...,21145.0,21597.0,17749.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0,21597.0
mean,4580474000.0,3.3732,2.115826,2080.32185,15099.41,1.494096,0.007602,0.233863,3.409825,7.657915,...,291.857224,1970.999676,836.650516,98077.951845,47.560093,-122.213983,1986.620318,12758.283512,540296.6,10799.0
std,2876736000.0,0.926299,0.768984,918.106125,41412.64,0.539683,0.086858,0.765686,0.650546,1.1732,...,442.490863,29.375234,4000.110554,53.513072,0.138552,0.140724,685.230472,27274.44195,367368.1,6234.661218
min,1000102.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,3.0,...,0.0,1900.0,0.0,98001.0,47.1559,-122.519,399.0,651.0,78000.0,1.0
25%,2123049000.0,3.0,1.75,1430.0,5040.0,1.0,0.0,0.0,3.0,7.0,...,0.0,1951.0,0.0,98033.0,47.4711,-122.328,1490.0,5100.0,322000.0,5400.0
50%,3904930000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,...,0.0,1975.0,0.0,98065.0,47.5718,-122.231,1840.0,7620.0,450000.0,10799.0
75%,7308900000.0,4.0,2.5,2550.0,10685.0,2.0,0.0,0.0,4.0,8.0,...,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0,645000.0,16198.0
max,9900000000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,...,4820.0,2015.0,20150.0,98199.0,47.7776,-121.315,6210.0,871200.0,7700000.0,21597.0


##### The data type of some columns can be changed, because there is no need to have floats as the datatype "everywhere". Additionally, NAs are dealt with in this part.
The removal of NAs and "0" values is done with the client in mind; the procedure would probably differ for another client.  
In addition, the date was given as a string and is converted to the pandas datetime format to make it easier to work with.

In [169]:
# Several columns are stored as floats although they only need integers, so they are cast to int.
# Three columns contain NAs, which are replaced by 0 before the cast:
#   - sqft_basement: if there is no basement, it cannot have an area (NA --> 0 sqft).
#   - view: a missing value is considered as no/(a bad) view --> 0. A good view would surely have been noted to increase the grade/value.
#   - waterfront: a missing value is considered as a lot without water front --> 0. Access to the water would very likely have been noted to increase the grade/value.
int_columns = ["sqft_living", "sqft_lot", "sqft_above", "sqft_living15", "sqft_lot15", "price"]
na_as_zero_columns = ["waterfront", "view", "sqft_basement"]

df[int_columns] = df[int_columns].astype(int)
df[na_as_zero_columns] = df[na_as_zero_columns].fillna(0.0).astype(int)

In [170]:
# convert the date column from strings (year-month-day) to the pandas datetime format
date_format = '%Y-%m-%d'
df['date'] = pd.to_datetime(df['date'], format=date_format)

In [171]:
# The yr_renovated column had an error in it: all years were inflated by a factor of 10
# (e.g. 19910.0 instead of 1991). They are divided by 10 and then stored as integers.
# All entries with the value "0" are set to NA, because it's easier for plots and sorting.
# The nullable integer dtype (pd.Int64Dtype) is pandas-specific and is not preserved when the
# data is saved to a csv-file. You have to set it again after importing the data!

# Work on a plain float copy so the cell also runs when the column is already Int64 (re-run).
yr_renovated = df["yr_renovated"].astype("float64")

# Only values that are still inflated (five digits, >= 10000) are scaled down. Years that were
# already corrected stay untouched, so running this cell twice no longer divides them again.
# NaN fails the comparison and is replaced by NaN / 10, which is still NaN.
yr_renovated = yr_renovated.where(yr_renovated < 10_000, yr_renovated / 10.0)

# 0 --> NaN (existing NaNs satisfy "!= 0" and are kept as they are).
yr_renovated = yr_renovated.where(yr_renovated != 0)

df["yr_renovated"] = yr_renovated.astype(pd.Int64Dtype())

In [158]:
# having a look at the dataframe after the first round of cleaning up the data.
# NOTE(review): this cell's execution count (158) is lower than that of the cells around it (171, 172),
# so the output shown for it comes from an earlier run - re-run the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   prop_id         21597 non-null  int64         
 1   bedrooms        21597 non-null  float64       
 2   bathrooms       21597 non-null  float64       
 3   sqft_living     21597 non-null  int64         
 4   sqft_lot        21597 non-null  int64         
 5   floors          21597 non-null  float64       
 6   waterfront      21597 non-null  int64         
 7   view            21597 non-null  int64         
 8   condition       21597 non-null  int64         
 9   grade           21597 non-null  int64         
 10  sqft_above      21597 non-null  int64         
 11  sqft_basement   21597 non-null  int64         
 12  yr_built        21597 non-null  int64         
 13  yr_renovated    744 non-null    Int64         
 14  zipcode         21597 non-null  int64         
 15  la

In [172]:
# preview the first rows of the dataframe after the clean-up (new column names, integer columns, NA in yr_renovated)
df.head()

Unnamed: 0,prop_id,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,...,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,date,price,transaction_id
0,7129300520,3.0,1.0,1180,5650,1.0,0,0,3,7,...,1955,,98178,47.5112,-122.257,1340,5650,2014-10-13,221900,1
1,6414100192,3.0,2.25,2570,7242,2.0,0,0,3,7,...,1951,1991.0,98125,47.721,-122.319,1690,7639,2014-12-09,538000,2
2,5631500400,2.0,1.0,770,10000,1.0,0,0,3,6,...,1933,,98028,47.7379,-122.233,2720,8062,2015-02-25,180000,3
3,2487200875,4.0,3.0,1960,5000,1.0,0,0,5,7,...,1965,,98136,47.5208,-122.393,1360,5000,2014-12-09,604000,4
4,1954400510,3.0,2.0,1680,8080,1.0,0,0,3,8,...,1987,,98074,47.6168,-122.045,1800,7503,2015-02-18,510000,5


##### After cleaning the data (in this case dropping columns, renaming columns, changing data types, and dealing with NA and "0" values), the data is exported to a separate csv-file. Please keep in mind that this cannot always be done, for example because of space constraints.

In [173]:
# write the cleaned data(frame) to a new, semicolon-separated csv-file without the index column
cleaned_csv_path = 'data/cleaned_realestate.csv'
df.to_csv(cleaned_csv_path, sep=';', index=False)

##### So far the data has not been heavily edited or transformed. There has been no look at outliers or strange data points.   
##### Only the yr_renovated column was edited to gain sensible data for the year. The data was not created in the future ;)