In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# Data Cleaning & Missing Values

### Load the Data

In [None]:
# The class dataset is read straight from the course repository.
# Offline alternative: pd.read_csv('berlin_housing_with_scraped.csv')
DATA_URL = 'https://raw.githubusercontent.com/ReDI-School/data-analytics/master/Class%20material/Class-11_Data_cleaning/berlin_housing_with_scraped.csv'
df = pd.read_csv(DATA_URL)

### Check the Data's `shape`,  `head`, and the `dtypes`

In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
df.dtypes

### First Question: Is there missing data? If yes, how much?

In [None]:
# Share of missing values per column, largest first (mean of a boolean mask == fraction of True).
df.isnull().mean().sort_values(ascending=False)

## Clean `tags`

Have a look at the tags columns. What does this information tell us?

In [None]:
df.tags[0]

In [None]:
df.tags.unique()

Because this doesn't give us any information, let's get rid of redundant data.

In [None]:
# Drop the uninformative column without mutating in place; errors='ignore'
# keeps the cell safe to re-run after 'tags' has already been removed.
df = df.drop(columns='tags', errors='ignore')

In [None]:
df.shape

## Clean `security_deposit`

In [None]:
df.security_deposit.value_counts()

Some values are clearly an amount. For example:

In [None]:
# Entries that carry a euro sign; missing values count as "no match".
has_euro_sign = df.security_deposit.str.contains("€", na=False)
df.security_deposit[has_euro_sign]

Other values are impossible to interpret.

In [None]:
# Entries that only repeat the word "Kaution"; missing values count as "no match".
mentions_kaution = df.security_deposit.str.contains("Kaution", na=False)
df.security_deposit[mentions_kaution]

Which values are numeric?

**First, let's remove the currency**

In [None]:
# str(np.nan) is the ordinary string 'nan', so its type is str: once a column is
# cast to str, a missing value is no longer recognised by pd.isna().
type(str(np.nan))

In [None]:
def remove_strings(value, strings_to_remove=('€', 'euro')):
    """
    Remove every string in <strings_to_remove> from <value>, ignoring case.

    Parameters
    ----------
    value : any
        Raw cell value. Missing values (anything pd.isna() recognises) are
        returned as NaN; everything else is cast to str first.
    strings_to_remove : iterable of str
        Substrings to delete. Defaults to the euro sign and the word "euro".

    Returns
    -------
    str or float
        The lowercased, stripped string with all occurrences removed,
        or np.nan when <value> is missing.
    """

    # return NaN if value is missing
    if pd.isna(value):
        return np.nan

    # cast to string so that string methods work
    value = str(value)

    # The value is lowercased before matching, so the needle has to be
    # lowercased too -- otherwise 'Euro' can never match 'euro'.
    for string in strings_to_remove:
        value = value.lower().replace(string.lower(), '').strip()

    return value

In [None]:
# Strip the currency markers from every entry. The trailing cast to str turns
# missing values into the literal string 'nan'.
currency_markers = ['€', 'Euro']
df['security_deposit'] = (
    df.security_deposit
    .apply(lambda raw: remove_strings(raw, currency_markers))
    .astype(str)
)

In [None]:
df.security_deposit.unique()

**Now, we need to convert the decimal into something a computer can better read**

In [None]:
def convert_string_to_float(value):
    """
    Rewrite German number notation into the notation float() understands.

    Periods (thousands separators) are removed and commas (decimal separators)
    become periods. The result is still a string, e.g. '1.234,56' -> '1234.56'.
    """
    return value.replace(".", "").replace(",", ".")


def convert_decimal(value):
    """
    Convert a numeric string to a float where possible.

    Returns
    -------
    float
        For missing input (np.nan) and for every string that parses to a
        finite number, either as-is or after German-notation conversion.
    str
        For anything that does not parse to a finite number; the string is
        returned after convert_string_to_float(), exactly as before, so
        non-numeric entries such as '9150.-' or 'ja' can be handled manually.
        The strings 'nan' / 'inf' are also returned as strings: callers cast
        columns with .astype(str), and those placeholders must not silently
        turn into real missing values here.
    """

    # return NaN if missing
    if pd.isna(value):
        return np.nan

    # Exactly three characters after the last period means the period is a
    # thousands separator (i.e. 3.500), so normalise before parsing.
    normalised = value
    if len(value.split('.')[-1]) == 3:
        normalised = convert_string_to_float(value)

    # Try the preferred reading first, then the German-notation fallback.
    for candidate in (normalised, convert_string_to_float(value)):
        try:
            number = float(candidate)
        except ValueError:
            continue
        # float() also accepts 'nan' and 'inf'; keep those as text.
        return number if np.isfinite(number) else candidate

    # not a number at all: hand back the converted text unchanged
    return convert_string_to_float(value)

In [None]:
# Normalise the number notation of every deposit entry.
df['security_deposit'] = df['security_deposit'].map(convert_decimal)

In [None]:
df.security_deposit.unique()

In [None]:
df.rent.isnull().sum()

**And now, let's get a list of non-numeric values**

In [None]:
def is_float(value):
    """
    Return True when float(value) succeeds, False otherwise.

    Only the errors float() raises for unusable input are caught (wrong type,
    unparsable text, integer too large), so unrelated problems such as a
    KeyboardInterrupt are no longer swallowed.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Distinct deposit entries that float() cannot parse.
not_numeric = df.security_deposit.apply(lambda entry: not is_float(entry))
df.security_deposit[not_numeric].unique()

**Build a map of what the values should be. This is manually set because it's a small amount of data.**

In [None]:
# Hand-written translations for the non-numeric deposit entries.
str_security_deposit_map = {
    # a number of monthly rents, written in various ways
    "drei": "3",
    "3x": "3",
    "3-fache": "3",
    "3nkm": "3",
    "2xnkm": "2",
    # absolute amounts with a trailing ".-"
    "9150.-": 9150,
    "6000.-": 6000,
}

# Entries that name the deposit without stating an amount.
str_security_deposit_unclear = ["ja", "kaution", "kaution:"]

In [None]:
# Translate known entries; anything without a translation passes through unchanged.
df['security_deposit'] = df['security_deposit'].map(
    lambda entry: str_security_deposit_map.get(entry, entry)
)

In [None]:
df.security_deposit.unique()

In [None]:
def convert_to_float(value):
    """Return value as a float, or NaN when is_float() rejects it."""
    return float(value) if is_float(value) else np.nan

# Display only -- nothing is written back to df.
deposit_as_number = df.security_deposit.apply(convert_to_float)
df.security_deposit[deposit_as_number <= 3].value_counts()

Let's assume that the unclear values are three months, as it is the most common option.

In [None]:
# Every unclear entry is mapped to the same assumed value: three months.
unclear_security_deposit_map = dict.fromkeys(str_security_deposit_unclear, '3')

In [None]:
# Replace the unclear entries; everything else passes through unchanged.
df['security_deposit'] = df['security_deposit'].map(
    lambda entry: unclear_security_deposit_map.get(entry, entry)
)

In [None]:
df.security_deposit.unique()

The values 1, 2 and 3 refer to the number of months' rent charged as deposit, so let's calculate those amounts by multiplying them by the rent.

In [None]:
df.rent.dtype

**Now, convert the month duration columns (3/2/etc Months) to an actual value, based on the rent**

In [None]:
def convert_months_to_total(row):
    """
    Return the deposit of <row> as an amount.

    Values up to 3 are read as a number of monthly rents and multiplied by
    row.rent; larger values are returned as they are. A deposit that cannot
    be parsed comes back as NaN.
    """
    deposit = convert_to_float(row.security_deposit)
    is_month_count = deposit <= 3  # False for NaN, which therefore passes through
    return float(row.rent * deposit) if is_month_count else deposit

In [None]:
# Compute the amount row by row, then attach it as a new column.
deposit_totals = df.apply(convert_months_to_total, axis=1)
df = df.assign(deposit_cleaned=deposit_totals)

df[["security_deposit", "deposit_cleaned", "rent"]].head(50)

#### Check Distribution and Descriptive Stats

In [None]:
px.histogram(df, x="deposit_cleaned", marginal='box')

In [None]:
df.deposit_cleaned.agg(['mean', 'median', 'std', 'min', 'max'])

#### Filling the missing values intuitively

(Note: these are old advertisements and no longer available)

 - https://www.immobilienscout24.de/expose/116573177: has all of its data missing, *will assume three months*
 - https://www.immobilienscout24.de/expose/115925878: has only some of its data missing, *will assume scraping error and therefore three months*
 - https://www.immobilienscout24.de/expose/114437800: has only deposit missing, *will assume there is no deposit*
 
**Because the data is missing for different reasons, we should use different fill strategies**

In [None]:
df[df.deposit_cleaned.isna()].head()

In [None]:
# Count the missing values per row across the trailing block of columns.
N_TRAILING_COLUMNS = 14
# Leave out an already-computed 'n_missing' so that re-running this cell
# looks at the same 14 columns instead of a window shifted by one.
trailing_columns = df.columns.drop('n_missing', errors='ignore')[-N_TRAILING_COLUMNS:]
df['n_missing'] = df[trailing_columns].isnull().sum(axis=1)

In [None]:
df[df.n_missing == 1].head(3)

In [None]:
def fill_missing_security_deposit(row):
    """
    Return the deposit for <row>, filling it in when it is missing.

    An existing deposit_cleaned is returned untouched. When it is missing,
    a row with exactly one missing value gets 0; any other row gets three
    times its rent.
    """
    deposit_is_missing = pd.isna(row.deposit_cleaned)
    if deposit_is_missing and row.n_missing == 1:
        return 0
    if deposit_is_missing:
        return row.rent * 3
    return row.deposit_cleaned

In [None]:
# Fill the gaps row by row and write the result back over the same column.
filled_deposits = df.apply(fill_missing_security_deposit, axis=1)
df['deposit_cleaned'] = filled_deposits

In [None]:
px.histogram(df, x="deposit_cleaned", marginal='box')

In [None]:
df.deposit_cleaned.agg(['mean', 'median', 'std', 'min', 'max'])

In [None]:
df.security_deposit.isnull().sum()

In [None]:
# Tiny demo frame: val_1 has two gaps, val_2 has one, and the last row is entirely empty.
missing = np.nan
tdf = pd.DataFrame(
    {
        'val_1': [1, 2, 3, missing, missing],
        'val_2': [1, 2, 3, 4, missing],
    }
)

In [None]:
tdf.dropna()

In [None]:
tdf.dropna(how='all')

In [None]:
tdf

In [None]:
# Replace the gaps in val_1 with a sentinel value.
tdf = tdf.assign(val_1=tdf['val_1'].fillna(-99))

In [None]:
tdf

## Recap:

 **Useful methods for dealing with missing data**:
 - `pd.isna(VALUE)` and  `pd.isnull(VALUE)`: check if a value is `NaN`, returns `boolean`
 - `df.COLUMN_NAME.isna()` and `df.COLUMN_NAME.isnull()`: check each value in column to see if it is `NaN`, returns `boolean`
 - `df.dropna()`: will drop **every** row that has a missing value
     - optional arguments:
         - `axis`: `0` (the default) drops rows, `1` drops columns
         - `how`: `any` or `all` drop if `any` value is missing, or `all` values in that row/col are missing
 - `df.fillna(VALUE_TO_FILL_WITH)`: will fill every missing value (`NaN`) with the value of your choice
 
**NOTE**: na and null do the **exact** same thing in Pandas, even the docs are the same. The duplicate methods are carried over from `R` which treated `na` and `null` as different values.

## Clean `property_condition`

In [None]:
# Share of missing values per column, largest first.
df.isnull().mean().sort_values(ascending=False)

In [None]:
df.head()

#### Categorical Features need to be handled differently than Continuous

Common methods:
 - Mode: May bias the dataset towards that category
 - Intuitive: Set the category based on domain knowledge, could be a problem if you are wrong
 - An `Unknown` Category: A catch-all category for missing values, especially useful if there is a reason the data is missing

In [None]:
# Fraction of rows without a parking_space value.
df['parking_space'].isnull().mean()

In [None]:
df.parking_space.mode()

In [None]:
df.property_condition.fillna('MISSING').value_counts()

In [None]:
# Missing conditions get their own catch-all category.
df = df.assign(property_condition=df['property_condition'].fillna('Unknown'))

In [None]:
# Name both columns explicitly: the labels produced by
# value_counts().reset_index() differ between pandas versions
# ('index'/'property_condition' before 2.0, 'property_condition'/'count' after),
# so relying on them breaks the plot on newer installations.
condition_counts = (
    df.property_condition
    .value_counts()
    .rename_axis('property_condition')
    .reset_index(name='count')
)
px.bar(condition_counts, x='property_condition', y='count')

## Clean `cold_rent`

In [None]:
# Share of missing values per column, largest first.
df.isnull().mean().sort_values(ascending=False)

**Since we have the warm rent for every sample, rather than using the median or mean, a better method might be to compute the overall ratio of cold rent to warm rent and apply it to the warm rent wherever the cold rent is missing**

In [None]:
df.cold_rent

In [None]:
# reuse functions from before
df['cold_rent'] = df.cold_rent.apply(lambda x: remove_strings(x, ["€", "Euro"])).astype(str)
df['cold_rent'] = df.cold_rent.apply(convert_decimal)

In [None]:
# Cast to float BEFORE dropping rows. At this point a missing cold rent can
# be the string 'nan' (left behind by the earlier .astype(str)), which
# dropna() does not recognise; the rent total would then include rows that
# contribute nothing to the cold-rent total and the ratio would be too low.
total_warm_and_cold = df[['rent', 'cold_rent']].astype(float).dropna().sum()
total_warm_and_cold

In [None]:
# Total cold rent divided by total warm rent, i.e. the share of the warm rent that is cold rent.
warm_cold_rent_ratio = total_warm_and_cold['cold_rent'] / total_warm_and_cold['rent']
warm_cold_rent_ratio

In [None]:
px.histogram(df[['cold_rent']].astype(float).dropna(), x='cold_rent', marginal='box')

In [None]:
# Make the column numeric so missing entries become real NaN values.
df = df.astype({'cold_rent': float})

In [None]:
def get_cold_rent(row):
    """
    Return the cold rent of <row> as a float.

    A missing cold rent is estimated as row.rent multiplied by the
    notebook-level warm_cold_rent_ratio.
    """
    if pd.isna(row.cold_rent):
        return row.rent * warm_cold_rent_ratio
    return float(row.cold_rent)

In [None]:
# Fill the missing cold rents row by row.
estimated_cold_rent = df.apply(get_cold_rent, axis=1)
df['cold_rent'] = estimated_cold_rent

In [None]:
px.histogram(df, x='cold_rent', marginal='box')

In [None]:
# Persist the cleaned frame for the next class.
CLEANED_PICKLE_PATH = 'berlin_housing_with_scraped_class_cleaned.pkl'
df.to_pickle(CLEANED_PICKLE_PATH)