In [106]:
import re
import pandas as pd
import numpy as np

In [107]:
# Date stamp (YYYY-MM-DD) used as the file-name prefix of the input and output CSVs
data_date = "2023-06-25"

In [108]:
# Load the dataset for the chosen date; the path is relative to the working directory.
# NOTE(review): the recorded cell output echoes this line, which looks like the tail of a
# pandas warning (possibly DtypeWarning for mixed-type columns) — confirm, and consider
# passing explicit dtypes for the affected columns.
df = pd.read_csv(f"{data_date}_data.csv")

  df = pd.read_csv(data_date + "_data.csv")


In [109]:
# Categorical columns that are expanded into one indicator column per category
one_hot_encode_cols = [
    'Typ bytu',
    'Umístění objektu',
    'Vybavení',
    'Vlastnictví',
    'Stav objektu',
    'Stavba',
    'Energetická náročnost budovy',
]

# Columns reduced to a 1/0 flag (value present / value missing)
binary_cols = [
    'Bazén',
    'Půdní vestavba',
    'Plyn',
    'Bezbariérový',
    'Terasa',
    'Garáž',
    'Výtah',
    'Sklep',
    'Parkování',
    'Balkón',
]

# Columns holding the distance to the nearest amenity of each kind
distance_cols = [
    'Večerka',
    'Cukrárna',
    'Divadlo',
    'Veterinář',
    'Hospoda',
    'Kino',
    'Hřiště',
    'Přírodní zajímavost',
    'Kulturní památka',
    'Sportoviště',
    'Bankomat',
    'Školka',
    'Pošta',
    'Vlak',
    'Bus MHD',
    'Škola',
    'Restaurace',
    'Lékař',
    'Tram',
    'Obchod',
    'Lékárna',
    'Metro',
]


In [110]:
# Define columns to keep
cols_to_keep = ['url_id', 'Podlaží', 'Užitná plocha', 'Latitude', 'Longitude', 'Celková cena']

# Every column the pipeline uses: the base columns above plus the three feature groups
all_cols_to_keep = set(cols_to_keep + one_hot_encode_cols + binary_cols + distance_cols)

# Columns present in the DataFrame
all_cols_in_df = set(df.columns)

# Anything that was loaded but is not in the keep-set gets dropped
cols_to_drop = list(all_cols_in_df - all_cols_to_keep)
df = df.drop(columns=cols_to_drop)

# Clean 'url_id': strip the literal path prefix. regex=False makes this a plain substring
# replacement regardless of the installed pandas version's default.
df['url_id'] = df['url_id'].str.replace('/cs/v2/estates/', '', regex=False)

# Prepare 'Podlaží': keep the first run of digits; entries without any digit become NaN.
# Raw strings are used for all patterns — '\d', '\D' and '\w' in a normal string literal are
# invalid escape sequences and trigger a SyntaxWarning on recent Python versions.
df['Podlaží'] = df['Podlaží'].str.extract(r'(\d+)', expand=False)
df['Podlaží'] = pd.to_numeric(df['Podlaží'], errors='coerce')

# Convert 'Celková cena' to float by removing every non-digit character from string values.
# Note that this also strips decimal separators and signs. Values left with no digits at all
# become NaN instead of raising on the empty string.
df['Celková cena'] = pd.to_numeric(
    df['Celková cena'].replace(r'\D', '', regex=True),
    errors='coerce',
).astype(float)

# Prepare 'Energetická náročnost budovy' by extracting the single character after "Třída "
df['Energetická náročnost budovy'] = df['Energetická náročnost budovy'].str.extract(r'Třída (\w)', expand=False)

In [111]:
# Expand each categorical column into one indicator column per category (no category dropped)
df = pd.get_dummies(df, columns=one_hot_encode_cols, drop_first=False)

# Binary features: 1 when the source cell holds any value, 0 when it is missing
binary_flags = {col: df[col].notnull().astype(int) for col in binary_cols}
df = df.assign(**binary_flags)

# Discard only the rows that have neither a 'url_id' nor a 'Celková cena'
df = df.dropna(how='all', subset=['url_id', 'Celková cena'])


# Missing 'Podlaží' values are filled with 0 (a constant, not the column average)
# NOTE(review): presumably entries without a digit denote the ground floor — confirm.
df = df.assign(**{'Podlaží': df['Podlaží'].fillna(0)})

In [112]:
# Distance columns with no recorded value get 10000 (representing 10 kilometers)
distances_filled = df[distance_cols].fillna(value=10000)
df[distance_cols] = distances_filled

def process_and_encode_distance(df, one_hot_encode_distance=False, columns=None):
    """Bin distance columns into proximity categories, optionally one-hot encoded.

    Each selected column is cut into the left-closed intervals [0, 500), [500, 1000),
    [1000, 2000), [2000, 5000) and [5000, inf), labelled 'Very Close', 'Close',
    'Medium Distance', 'Far' and 'Very Far'. Values that fall in no interval
    (negative numbers, NaN) become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the distance columns. It is not modified; the work is
        done on a copy, which is returned.
    one_hot_encode_distance : bool, default False
        If True, replace every binned column with indicator columns created by
        ``pd.get_dummies(..., drop_first=True)``, i.e. 'Very Close' has no
        column of its own.
    columns : list of str, optional
        Columns to bin. Defaults to the notebook-level ``distance_cols`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected columns binned (and encoded if requested).
    """
    if columns is None:
        columns = distance_cols
    columns = list(columns)

    # Bin edges and the label assigned to each interval
    bins = [0, 500, 1000, 2000, 5000, float('inf')]
    labels = ['Very Close', 'Close', 'Medium Distance', 'Far', 'Very Far']

    # Work on a copy so the caller's frame is never changed as a side effect
    result = df.copy()
    for col in columns:
        # right=False -> intervals are closed on the left: 500 falls into 'Close'
        result[col] = pd.cut(result[col], bins=bins, labels=labels, right=False)

    # Conditionally one-hot encode
    if one_hot_encode_distance:
        result = pd.get_dummies(result, columns=columns, drop_first=True)

    return result

# Bin every distance column and replace the bins with one-hot indicator columns
df = process_and_encode_distance(df, one_hot_encode_distance=True)

In [113]:
# Show 'Užitná plocha' sorted from lowest to highest to inspect both extremes of the column
df['Užitná plocha'].sort_values()



8307       5.0
10233      5.0
12728      7.0
16406      8.0
15987      9.0
         ...  
3662     546.0
10184    645.0
5430     743.0
4488     758.0
18070    927.0
Name: Užitná plocha, Length: 19095, dtype: float64

In [114]:
# Drop the 50 rows with the lowest 'Užitná plocha' values
# (presumably implausibly small areas — confirm the cut-off).
# NOTE: this cell is not idempotent — every re-run removes another 50 rows.
n_smallest_to_drop = 50
smallest_area_index = df['Užitná plocha'].sort_values().iloc[:n_smallest_to_drop].index
df = df.drop(smallest_area_index)

In [115]:
# Re-check 'Užitná plocha' sorted from lowest to highest after the smallest rows were dropped
df['Užitná plocha'].sort_values()

15037     11.0
703       11.0
18019     11.0
15751     11.0
13655     11.0
         ...  
3662     546.0
10184    645.0
5430     743.0
4488     758.0
18070    927.0
Name: Užitná plocha, Length: 19045, dtype: float64

In [116]:
def remove_outliers_using_iqr(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` and are inclusive.
    Rows whose value is NaN fail both comparisons and are therefore excluded.
    The input frame itself is left unchanged.
    """
    values = df[column_name]

    # Quartiles and the spread between them
    first_quartile, third_quartile = (values.quantile(q) for q in (0.25, 0.75))
    spread = third_quartile - first_quartile

    # Inclusive fences 1.5 spreads beyond each quartile
    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    within_fences = values.between(fence_low, fence_high)
    return df[within_fences]

# First pass: filter on the price column
df_cleaned_cena = remove_outliers_using_iqr(df, column_name="Celková cena")

# Second pass: filter the price-filtered frame on the floor-area column
df_cleaned_both = remove_outliers_using_iqr(df_cleaned_cena, column_name="Užitná plocha")

In [117]:
# Preview the first rows of the frame that passed both outlier filters
df_cleaned_both.head()


Unnamed: 0,url_id,Celková cena,Podlaží,Užitná plocha,Sklep,Parkování,Výtah,Latitude,Longitude,Terasa,...,Obchod_Far,Obchod_Very Far,Lékárna_Close,Lékárna_Medium Distance,Lékárna_Far,Lékárna_Very Far,Metro_Close,Metro_Medium Distance,Metro_Far,Metro_Very Far
0,722035788,6832000.0,2.0,48.0,1,1,1,50.049548,14.462156,0,...,False,False,False,True,False,False,False,True,False,False
2,2418832972,6361000.0,1.0,53.0,0,1,0,50.296096,16.359198,0,...,False,True,False,False,False,True,False,False,False,True
3,3930260556,2214000.0,5.0,35.0,1,1,1,50.723614,15.189542,0,...,False,False,False,False,False,False,False,False,False,True
5,3756315724,5643000.0,4.0,68.0,1,0,1,50.051077,14.298411,0,...,False,False,True,False,False,False,True,False,False,False
6,3955312204,5390000.0,4.0,38.0,1,0,1,50.049265,14.43826,0,...,False,False,False,False,False,False,False,False,False,False


In [118]:
# Per-column count of missing cells, keeping only the columns that have at least one
missing_values = df_cleaned_both.isna().sum().loc[lambda counts: counts > 0]

missing_values

Series([], dtype: int64)

In [119]:
# (rows, columns) of the frame before the IQR outlier filtering
df.shape

(19045, 143)

In [120]:
# (rows, columns) of the frame after the IQR filtering on both columns
df_cleaned_both.shape

(15684, 143)

In [121]:
# Write the cleaned frame next to the input file, without the index column
df_cleaned_both.to_csv(f"{data_date}_data_cleaned.csv", index=False)