In [13]:
import pandas as pd
import numpy as np

In [14]:
# Load the cleaned listings export; later cells read its "Latitude",
# "Longitude", "Celková cena" and "Užitná plocha" columns.
df = pd.read_csv(filepath_or_buffer='test_2023-08-13_data_cleaned.csv')

In [2]:
# Reference locations as (latitude, longitude) pairs in decimal degrees.
# NOTE(review): "Moravskoslezský" carries exactly the same coordinates as
# "Ostrava", so the two derived distance columns are identical. It is kept
# here so the output schema does not change - confirm whether it is wanted.
county_seats = {
    "Praha": (50.0755, 14.4378),
    "České Budějovice": (48.9737, 14.4875),
    "Brno": (49.1951, 16.6068),
    "Karlovy Vary": (50.2313, 12.8718),
    "Jihlava": (49.3961, 15.5912),
    "Hradec Králové": (50.2095, 15.8328),
    "Liberec": (50.7667, 15.0562),
    "Ostrava": (49.8209, 18.2625),
    "Olomouc": (49.5938, 17.2509),
    "Pardubice": (50.0405, 15.7792),
    "Plzeň": (49.7384, 13.3736),
    "Ústí nad Labem": (50.6606, 14.0400),
    "Zlín": (49.2269, 17.6689),
    "Moravskoslezský": (49.8209, 18.2625),
}

# Four extra points around the "Praha" entry above: the same centre shifted
# by 0.05 degrees of latitude (north/south) or 0.1 degrees of longitude
# (west/east).
prague_points = {
    "Prague North": (50.1255, 14.4378),
    "Prague West": (50.0755, 14.3378),
    "Prague South": (50.0255, 14.4378),
    "Prague East": (50.0755, 14.5378),
}

# Single lookup of every reference point: county seats first, then the
# Prague points, preserving insertion order.
reference_points = dict(county_seats)
reference_points.update(prague_points)


In [5]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine (great-circle) distance in kilometers between
    two points on the earth, specified in decimal degrees.

    Each argument may be a scalar, a NumPy array or a pandas Series; the
    arithmetic is element-wise, so a column of coordinates can be compared
    against a single fixed point in one call.

    Returns the distance(s) in kilometers, using an Earth radius of 6371 km.
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Add one "Distance_to_<name>" column (in km) per reference point, computed
# from each listing's "Latitude"/"Longitude". Re-running the cell simply
# overwrites the same columns.
distance_columns = []
for point_name, (lat, lon) in reference_points.items():
    column = f"Distance_to_{point_name.replace(' ', '_')}"
    df[column] = haversine_distance(df["Latitude"], df["Longitude"], lat, lon)
    distance_columns.append(column)

# Preview the new columns by name rather than by a hard-coded position
# (previously `iloc[:, -18:]`), so the preview stays correct if reference
# points are added or removed, or if the columns are not the last ones.
df[distance_columns].head()


Unnamed: 0,Distance_to_Praha,Distance_to_České_Budějovice,Distance_to_Brno,Distance_to_Karlovy_Vary,Distance_to_Jihlava,Distance_to_Hradec_Králové,Distance_to_Liberec,Distance_to_Ostrava,Distance_to_Olomouc,Distance_to_Pardubice,Distance_to_Plzeň,Distance_to_Ústí_nad_Labem,Distance_to_Zlín,Distance_to_Moravskoslezský,Distance_to_Prague_North,Distance_to_Prague_West,Distance_to_Prague_South,Distance_to_Prague_East
0,18.12376,137.522556,201.42443,100.213082,130.051,110.53901,83.344928,287.726477,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
1,18.12376,137.522556,201.42443,100.213082,130.051,110.53901,83.344928,287.726477,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
2,18.12376,137.522556,201.42443,100.213082,130.051,110.53901,83.344928,287.726477,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
3,18.12376,137.522556,201.42443,100.213082,130.051,110.53901,83.344928,287.726477,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
4,18.12376,137.522556,201.42443,100.213082,130.051,110.53901,83.344928,287.726477,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746


In [7]:
# Show the first 20 rows of the frame for a quick visual check.
df.head(n=20)

Unnamed: 0,url_id,Celková cena,Podlaží,Užitná plocha,Parkování,Garáž,Latitude,Longitude,Terasa,Balkón,...,Distance_to_Olomouc,Distance_to_Pardubice,Distance_to_Plzeň,Distance_to_Ústí_nad_Labem,Distance_to_Zlín,Distance_to_Moravskoslezský,Distance_to_Prague_North,Distance_to_Prague_West,Distance_to_Prague_South,Distance_to_Prague_East
0,2059674700,2857000.0,1.0,47,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
1,2765366348,2163000.0,3.0,29,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
2,3099862092,1956000.0,3.0,26,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
3,886318156,2100000.0,2.0,27,1,1,50.203105,14.279582,1,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
4,1892951116,2569000.0,3.0,37,1,1,50.203105,14.279582,1,1,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
5,550773836,1956000.0,3.0,26,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
6,2211718220,2028000.0,3.0,27,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
7,1752917068,2740000.0,2.0,43,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
8,3744162892,2740000.0,1.0,43,1,1,50.203105,14.279582,0,0,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746
9,2604360780,2839000.0,2.0,46,1,1,50.203105,14.279582,1,1,...,223.328155,108.429132,82.874577,53.626842,266.734091,287.726477,14.194164,14.783174,22.743979,23.23746


In [15]:
def remove_outliers_using_iqr(df, column_name, multiplier=1.5):
    """Remove outliers from a DataFrame using the IQR method for a specified column.

    A row is kept when its value in ``column_name`` lies within
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` (both ends inclusive),
    where Q1/Q3 are the 25th/75th percentiles of that column and
    ``IQR = Q3 - Q1``. Rows whose value is missing (NaN) fail the comparison
    and are dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the numeric column to filter on.
    multiplier : float, default 1.5
        Width of the accepted band in units of IQR.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows with their original index
        labels, so later assignments on the result neither touch ``df`` nor
        trigger ``SettingWithCopyWarning``.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    values = df[column_name]

    # Quartiles and interquartile range of the target column
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Accepted band around the middle 50% of the data
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # `between` is inclusive on both ends by default
    within_bounds = values.between(lower_bound, upper_bound)
    return df[within_bounds].copy()

# First pass: drop rows whose total price ("Celková cena") is an IQR outlier.
df_cleaned_cena = remove_outliers_using_iqr(df=df, column_name="Celková cena")

# Second pass: from the price-filtered rows, drop usable-area
# ("Užitná plocha") outliers. The quartiles for this pass are computed on
# the already filtered data, not on the full frame.
df_cleaned_both = remove_outliers_using_iqr(
    df=df_cleaned_cena,
    column_name="Užitná plocha",
)


In [17]:
# Persist the outlier-filtered listings; the index is left out of the file.
df_cleaned_both.to_csv(
    path_or_buf="test_2023-08-13_data_cleaned_test.csv",
    index=False,
)