In [601]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import re

# California Housing dataset

In [495]:
# Fetch the California Housing data through scikit-learn; later cells read its
# "data", "feature_names" and "target" entries.
california = fetch_california_housing()

In [497]:
# Feature matrix as a labelled frame, with the regression target appended as 'Price'
df_cali = pd.DataFrame(
    data=california["data"],
    columns=california["feature_names"],
).assign(Price=california["target"])

In [499]:
# Preview the first five rows of the California frame
display(df_cali.head(n=5))

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,Price
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [430]:
# Dataset description.
# The object returned by fetch_california_housing() is bound to `california` in this
# notebook; `california_housing` is never defined and raises NameError on a fresh kernel.
print(california.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [501]:
# The target is expressed in units of $100,000 (see the dataset description), so
# scale it to plain dollars. The column is derived from the raw target rather than
# from 'Price' itself, so re-running this cell does not apply the factor twice.
df_cali['Price'] = california["target"] * 100000

In [503]:
# Remove the location and population features from the California frame
df_cali = df_cali.drop(columns=['Latitude', 'Longitude', 'Population'])

In [505]:
# Tag every California row with its country of origin
df_cali = df_cali.assign(Country='USA')

In [507]:
# First five rows after scaling, dropping and tagging
df_cali.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,AveOccup,Price,Country
0,8.3252,41.0,6.984127,1.02381,2.555556,452600.0,USA
1,8.3014,21.0,6.238137,0.97188,2.109842,358500.0,USA
2,7.2574,52.0,8.288136,1.073446,2.80226,352100.0,USA
3,5.6431,52.0,5.817352,1.073059,2.547945,341300.0,USA
4,3.8462,52.0,6.281853,1.081081,2.181467,342200.0,USA


# Real Estate dataset

In [649]:
# The Slovak listings file is semicolon-separated
real_estate_df = pd.read_csv('../data/raw/RealEstateDataset.csv', sep=';')

display(real_estate_df.head(n=5))

Unnamed: 0,name_nsi,price,index,environment,quality_of_living,safety,transport,services,relax,condition,...,last_reconstruction,total_floors,floor,lift,balkonies,loggia,cellar,type,rooms,district
0,Semerovo,42000,,,,,,,,Original condition,...,,,,0,,,0,3-room apartment,3,Nové Zámky
1,Semerovo,42000,,,,,,,,Original condition,...,,2.0,,0,,,0,3-room apartment,3,Nové Zámky
2,Štúrovo,107000,83.0,,,,,,,Partial reconstruction,...,,5.0,3.0,0,,,0,1-room apartment,1,Nové Zámky
3,Štúrovo,105000,,,,,,,,Complete reconstruction,...,,7.0,4.0,1,,,0,3-room apartment,3,Nové Zámky
4,Štúrovo,82000,,,,,,,,Partial reconstruction,...,2018.0,,2.0,0,,,0,2-room apartment,2,Nové Zámky


In [651]:
# List every column of the raw Slovak frame before choosing which to drop
real_estate_df.columns

Index(['name_nsi', 'price', 'index', 'environment', 'quality_of_living',
       'safety', 'transport', 'services', 'relax', 'condition', 'area',
       'energy_costs', 'provision', 'certificate', 'construction_type',
       'orientation', 'year_built', 'last_reconstruction', 'total_floors',
       'floor', 'lift', 'balkonies', 'loggia', 'cellar', 'type', 'rooms',
       'district'],
      dtype='object')

In [653]:
# Distinct values of the 'type' column
pd.unique(real_estate_df['type'])

array(['3-room apartment', '1-room apartment', '2-room apartment',
       '4-room apartment', 'Studio', '5 or more room apartment',
       'Two-room apartment'], dtype=object)

In [655]:
# Columns that are not carried over into the combined dataset
unused_columns = [
    'name_nsi', 'index', 'environment', 'quality_of_living', 'safety',
    'transport', 'services', 'relax', 'energy_costs', 'condition',
    'provision', 'orientation', 'lift', 'balkonies', 'loggia', 'cellar',
    'district', 'certificate',
]
real_estate_df = real_estate_df.drop(columns=unused_columns)

In [657]:
# Tag every Slovak row with its country of origin
real_estate_df = real_estate_df.assign(Country='Slovakia')

In [659]:
# Preview the reduced Slovak frame
display(real_estate_df.head(n=5))

Unnamed: 0,price,area,construction_type,year_built,last_reconstruction,total_floors,floor,type,rooms,Country
0,42000,58,,,,,,3-room apartment,3,Slovakia
1,42000,58,Brick,,,2.0,,3-room apartment,3,Slovakia
2,107000,40,,,,5.0,3.0,1-room apartment,1,Slovakia
3,105000,76,,,,7.0,4.0,3-room apartment,3,Slovakia
4,82000,63,,,2018.0,,2.0,2-room apartment,2,Slovakia


<br>
<br>

# Data Integration

### California dataset columns to consider:

- **MedInc** (Median Income)
- **HouseAge** (Median Age of Houses)
- **AveRooms** (Average Rooms per Household)
- **AveBedrms** (Average Bedrooms per Household)
- **MedHouseVal** (Median House Value; stored in this notebook as **Price**)

## Mapping Slovak columns

### Price
Currency conversion

In [661]:
# EUR -> USD conversion for the Slovak prices (1 EUR = 1.09 USD)
exchange_rate = 1.09
real_estate_df['Price'] = real_estate_df['price'].mul(exchange_rate)

In [663]:
# Original EUR price next to the converted USD price
print(real_estate_df.loc[:, ['price', 'Price']].head(n=5))

    price     Price
0   42000   45780.0
1   42000   45780.0
2  107000  116630.0
3  105000  114450.0
4   82000   89380.0


In [665]:
# The EUR column is no longer needed once 'Price' (USD) exists
real_estate_df = real_estate_df.drop(columns=['price'])

In [669]:
# First five rows after the currency conversion
real_estate_df.head(n=5)

Unnamed: 0,area,construction_type,year_built,last_reconstruction,total_floors,floor,type,rooms,Country,Price
0,58,,,,,,3-room apartment,3,Slovakia,45780.0
1,58,Brick,,,2.0,,3-room apartment,3,Slovakia,45780.0
2,40,,,,5.0,3.0,1-room apartment,1,Slovakia,116630.0
3,76,,,,7.0,4.0,3-room apartment,3,Slovakia,114450.0
4,63,,,2018.0,,2.0,2-room apartment,2,Slovakia,89380.0


### MedInc
- Calculate statistics from the California dataset:

In [675]:
# Summary statistics of the California 'MedInc' column
medinc = df_cali['MedInc']
medinc_min, medinc_max, medinc_mean = medinc.min(), medinc.max(), medinc.mean()
# 25th, 50th and 75th percentiles as a plain array
medinc_quantiles = medinc.quantile([0.25, 0.5, 0.75]).to_numpy()

print(f"Min: {medinc_min}, Max: {medinc_max}, Mean: {medinc_mean}, Quantiles: {medinc_quantiles}")

Min: 0.4999, Max: 15.0001, Mean: 3.8706710029069766, Quantiles: [2.5634  3.5348  4.74325]


- Define price bins for Slovakia from the quartiles of the Slovak prices, and label each bin with the matching 'MedInc' statistic from California:

In [679]:
# Quartiles (25%, 50%, 75%) of the Slovak property prices; they split the
# listings into four price categories
slovak_price_quantiles = real_estate_df['Price'].quantile([0.25, 0.5, 0.75]).values

# Bin edges: zero, the three quartiles, then an open-ended top bin
price_bins = [0, *slovak_price_quantiles, float('inf')]

In [681]:
# One label per price bin: the three California 'MedInc' quartiles for the lower
# bins and the California maximum for the top bin
income_labels = [*medinc_quantiles, medinc_max]

In [683]:
# Bucket each Slovak price into its bin, labelled with the matching California
# income level, then store the label as a plain float in 'MedInc'
price_category = pd.cut(real_estate_df['Price'], bins=price_bins, labels=income_labels)
real_estate_df['MedInc'] = price_category.astype(float)

In [685]:
# Price next to the income level assigned to it
print(real_estate_df.loc[:, ['Price', 'MedInc']].head(n=5))

      Price  MedInc
0   45780.0  2.5634
1   45780.0  2.5634
2  116630.0  3.5348
3  114450.0  3.5348
4   89380.0  2.5634


In [687]:
# Preview the Slovak frame with the new 'MedInc' column
display(real_estate_df.head(n=5))

Unnamed: 0,area,construction_type,year_built,last_reconstruction,total_floors,floor,type,rooms,Country,Price,MedInc
0,58,,,,,,3-room apartment,3,Slovakia,45780.0,2.5634
1,58,Brick,,,2.0,,3-room apartment,3,Slovakia,45780.0,2.5634
2,40,,,,5.0,3.0,1-room apartment,1,Slovakia,116630.0,3.5348
3,76,,,,7.0,4.0,3-room apartment,3,Slovakia,114450.0,3.5348
4,63,,,2018.0,,2.0,2-room apartment,2,Slovakia,89380.0,2.5634


### HouseAge

In [689]:
# 'HouseAge' is approximated from 'last_reconstruction': years elapsed up to 2024,
# with a missing reconstruction year treated as 2000
reconstruction_year = real_estate_df['last_reconstruction'].fillna(2000)
real_estate_df['HouseAge'] = 2024 - reconstruction_year


### AveRooms

In [691]:
# 'AveRooms' is a direct copy of the listing's 'rooms' count
real_estate_df = real_estate_df.assign(AveRooms=real_estate_df['rooms'])

### AveBedrms

In [693]:
# Use the first run of digits in 'type' as the bedroom count.
# The vectorised .str.extract replaces a per-row apply that ran the regex twice.
# Labels without a digit (e.g. 'Studio', 'Two-room apartment') and missing values
# produce NaN, so the column is always float.
real_estate_df['AveBedrms'] = (
    real_estate_df['type']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

In [697]:
# Drop the raw Slovak columns that are not kept in the final frame
real_estate_df = real_estate_df.drop(
    columns=['construction_type', 'year_built', 'last_reconstruction', 'rooms']
)

In [699]:
# First five rows with the mapped columns in place
real_estate_df.head(n=5)

Unnamed: 0,area,total_floors,floor,type,Country,Price,MedInc,HouseAge,AveRooms,AveBedrms
0,58,,,3-room apartment,Slovakia,45780.0,2.5634,24.0,3,3.0
1,58,2.0,,3-room apartment,Slovakia,45780.0,2.5634,24.0,3,3.0
2,40,5.0,3.0,1-room apartment,Slovakia,116630.0,3.5348,24.0,1,1.0
3,76,7.0,4.0,3-room apartment,Slovakia,114450.0,3.5348,24.0,3,3.0
4,63,,2.0,2-room apartment,Slovakia,89380.0,2.5634,6.0,2,2.0


### AveOccup
Defined based on AveBedrms and AveRooms:

In [704]:
# 'AveOccup' estimate: 1.2 per bedroom plus 0.5 per remaining (non-bedroom) room
bedrooms = real_estate_df['AveBedrms']
other_rooms = real_estate_df['AveRooms'] - bedrooms
real_estate_df['AveOccup'] = (1.2 * bedrooms) + (0.5 * other_rooms)

print(real_estate_df.loc[:, ['AveBedrms', 'AveRooms', 'AveOccup']].head(n=5))

   AveBedrms  AveRooms  AveOccup
0        3.0         3       3.6
1        3.0         3       3.6
2        1.0         1       1.2
3        3.0         3       3.6
4        2.0         2       2.4


#### Final cleaning

In [706]:
# Keep only the columns shared with the California frame, in the same order
column_order = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'AveOccup',
    'Price',
    'Country',
]
real_estate_df = real_estate_df.loc[:, column_order]

In [708]:
# Slovak frame in its final column layout
real_estate_df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,AveOccup,Price,Country
0,2.5634,24.0,3,3.0,3.6,45780.0,Slovakia
1,2.5634,24.0,3,3.0,3.6,45780.0,Slovakia
2,3.5348,24.0,1,1.0,1.2,116630.0,Slovakia
3,3.5348,24.0,3,3.0,3.6,114450.0,Slovakia
4,2.5634,6.0,2,2.0,2.4,89380.0,Slovakia


In [710]:
# California frame, for a side-by-side check of the column layout
df_cali.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,AveOccup,Price,Country
0,8.3252,41.0,6.984127,1.02381,2.555556,452600.0,USA
1,8.3014,21.0,6.238137,0.97188,2.109842,358500.0,USA
2,7.2574,52.0,8.288136,1.073446,2.80226,352100.0,USA
3,5.6431,52.0,5.817352,1.073059,2.547945,341300.0,USA
4,3.8462,52.0,6.281853,1.081081,2.181467,342200.0,USA


## Concatenate the Datasets

In [715]:
# Stack the Slovak rows on top of the California rows with a fresh 0..n-1 index
frames = [real_estate_df, df_cali]
combined_df = pd.concat(frames, axis=0, ignore_index=True)
display(combined_df)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,AveOccup,Price,Country
0,2.5634,24.0,3.000000,3.000000,3.600000,45780.0,Slovakia
1,2.5634,24.0,3.000000,3.000000,3.600000,45780.0,Slovakia
2,3.5348,24.0,1.000000,1.000000,1.200000,116630.0,Slovakia
3,3.5348,24.0,3.000000,3.000000,3.600000,114450.0,Slovakia
4,2.5634,6.0,2.000000,2.000000,2.400000,89380.0,Slovakia
...,...,...,...,...,...,...,...
36038,1.5603,25.0,5.045455,1.133333,2.560606,78100.0,USA
36039,2.5568,18.0,6.114035,1.315789,3.122807,77100.0,USA
36040,1.7000,17.0,5.205543,1.120092,2.325635,92300.0,USA
36041,1.8672,18.0,5.329513,1.171920,2.123209,84700.0,USA


In [717]:
# Persist the combined frame without writing the row index
combined_df.to_csv(path_or_buf='../data/clean/combined_dataset.csv', index=False)