In [1]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np

In [2]:
# Get the current working directory
current_path = os.getcwd()

# Move two levels up to reach the project root (e.g., from Scr/GUI to project folder)
project_root = os.path.abspath(os.path.join(current_path, "..", ".."))
print("PROJECT ROOT:", project_root)

# Build the path to the raw data CSV file
csv_path = os.path.join(project_root, "Data", "raw", "flats.csv")

# Load the dataset into a DataFrame
df = pd.read_csv(csv_path)
print(f"Data loaded from: {csv_path}")

PROJECT ROOT: c:\Users\Dell\OneDrive\Desktop\Yerevan-Flat-Price-Prediction
Data loaded from: c:\Users\Dell\OneDrive\Desktop\Yerevan-Flat-Price-Prediction\Data\raw\flats.csv


In [3]:

# Rename columns from Armenian to English
new_columns = {
    "Շինության տիպ": "Building Type",
    "Նորակառույց": "New Building",
    "Վերելակ": "Elevator",
    "Հարկերի քանակ": "Number of Floors",
    "Ընդհանուր մակերես": "Total Area",
    "Սենյակների քանակ": "Number of Rooms",
    "Սանհանգույցների քանակ": "Number of Bathrooms",
    "Առաստաղի բարձրություն": "Ceiling Height",
    "Հարկ": "Floor",
    "Պատշգամբ": "Balcony",
    "Կահույք": "Furniture",
    "Վերանորոգում": "Renovation",
    "Գին": "Price",
    "Փողոց": "Street",
    "Region": "District"
}
df.rename(columns=new_columns, inplace=True)

#Define translation dictionaries for categorical values
translation_dicts = {
    "Building Type": {
        'Քարե': 'Stone',
        'Մոնոլիտ': 'Monolith',
        'Պանելային': 'Panel',
        'Փայտե': 'Wooden',
        'Կասետային': 'Cassette',
        'Աղյուսե': 'Brick'
    },
    "New Building": {
        'Ոչ': 'No',
        'Այո': 'Yes'
    },
    "Elevator": {
        'Առկա է': 'Available',
        'Առկա չէ': 'Not Available'
    },
    "Balcony": {
        'Առկա չէ': 'Not Available',
        'Փակ պատշգամբ': 'Closed Balcony',
        'Բաց պատշգամբ': 'Open Balcony',
        'Մի քանի պատշգամբ': 'Multiple Balconies'
    },
    "Furniture": {
        'Մասնակի կահույք': 'Partially Furnished',
        'Համաձայնությամբ': 'Negotiable',
        'Առկա է': 'Available',
        'Առկա չէ': 'Not Available'
    },
    "Renovation": {
        'Հին վերանորոգում': 'Old Renovation',
        'Կապիտալ վերանորոգված': 'Capital Renovated',
        'Դիզայներական ոճով վերանորոգված': 'Designer Renovated',
        'Մասնակի վերանորոգում': 'Partially Renovated',
        'Չվերանորոգված': 'Not Renovated',
        'Եվրովերանորոգված': 'Euro Renovation',
        'Կոսմետիկ վերանորոգում': 'Cosmetic Renovation'
    }
}

#Apply translations to categorical columns
for column, translation_map in translation_dicts.items():
    if column in df.columns:
        df[column] = df[column].replace(translation_map)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33719 entries, 0 to 33718
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        33719 non-null  object 
 1   New Building         33719 non-null  object 
 2   Elevator             33719 non-null  object 
 3   Number of Floors     33719 non-null  object 
 4   Total Area           33719 non-null  object 
 5   Number of Rooms      33719 non-null  object 
 6   Number of Bathrooms  33719 non-null  object 
 7   Ceiling Height       33719 non-null  object 
 8   Floor                33719 non-null  float64
 9   Balcony              33719 non-null  object 
 10  Furniture            33719 non-null  object 
 11  Renovation           33719 non-null  object 
 12  Price                33719 non-null  object 
 13  Street               33719 non-null  object 
 14  District             32536 non-null  object 
dtypes: float64(1), object(14)
memory usa

In [5]:
for col in df.columns:
    print(df[col].value_counts())
    print("----------------------")

Building Type
Monolith    13097
Panel       10285
Stone       10252
Cassette       69
Brick          12
Wooden          4
Name: count, dtype: int64
----------------------
New Building
No     20946
Yes    12773
Name: count, dtype: int64
----------------------
Elevator
Available        23443
Not Available    10276
Name: count, dtype: int64
----------------------
Number of Floors
5.0     5190
9.0     5096
5       2068
14.0    2044
4.0     1952
16.0    1808
9       1310
12.0     977
10.0     960
14       936
16       907
4        901
15.0     748
18.0     702
8.0      645
6.0      643
6        500
10       495
15       494
7.0      467
13.0     452
11.0     451
12       436
18       409
8        393
3.0      322
17.0     319
11       317
17       297
13       254
7        251
3        179
19.0     134
20.0      96
19        92
20        83
2.0       80
22.0      65
21.0      49
22        47
2         32
23.0      30
21        25
1.0       20
23        17
25         7
25.0       6
24.0     

In [6]:
# Remove the 'Street' column (not needed for prediction)
df = df.drop("Street", axis=1)

# Keep only rows where Building Type is one of the allowed types
allowed_types = ["Panel", "Monolith", "Stone"]
df = df[df["Building Type"].isin(allowed_types)]

# Replace 'Negotiable' furniture with 'Available'
df["Furniture"] = df["Furniture"].replace("Negotiable", "Available")

# Keep only rows where District is in the defined valid list
valid_districts = [
    "Kentron", "Arabkir", "Malatia-Sebastia", "Ajapnyak", "Qanaqer-Zeytun",
    "Nor Norq", "Shengavit", "Davtashen", "Erebuni", "Avan"
]
df = df[df["District"].isin(valid_districts)]

In [7]:
# Convert 'Number of Floors' to integer, invalid values become NaN
df["Number of Floors"] = pd.to_numeric(df["Number of Floors"], errors="coerce").astype(int)

# Extract numeric values from 'Total Area' (remove text like 'm²') and convert to float
df["Total Area"] = df["Total Area"].astype(str).str.extract(r"(\d+\.?\d*)").astype(float)

# Convert 'Number of Rooms' to float, invalid values become NaN
df["Number of Rooms"] = pd.to_numeric(df["Number of Rooms"], errors="coerce").astype(float)

# Remove '+' from 'Number of Bathrooms' and convert to float
df["Number of Bathrooms"] = df["Number of Bathrooms"].astype(str).str.rstrip("+").astype(float)

# Replace commas with dots in 'Ceiling Height', extract numeric part, and convert to float
df["Ceiling Height"] = (
    df["Ceiling Height"]
    .astype(str)
    .str.replace(",", ".", regex=False)
    .str.extract(r"(\d+\.?\d*)")
    .astype(float)
)

In [8]:
for col in df.columns:
    print(df[col].value_counts())
    print("----------------------")

Building Type
Monolith    12323
Panel        9993
Stone        9887
Name: count, dtype: int64
----------------------
New Building
No     20215
Yes    11988
Name: count, dtype: int64
----------------------
Elevator
Available        22373
Not Available     9830
Name: count, dtype: int64
----------------------
Number of Floors
5     6925
9     6219
14    2832
4     2696
16    2634
10    1405
12    1367
15    1189
6     1070
18    1070
8     1019
7      691
11     690
13     678
17     496
3      473
19     210
20     173
22     109
2      102
21      69
23      47
1       21
25      12
24       6
Name: count, dtype: int64
----------------------
Total Area
80.0     1182
75.0      880
90.0      830
100.0     738
70.0      732
         ... 
313.0       1
13.0        1
232.0       1
284.0       1
375.0       1
Name: count, Length: 274, dtype: int64
----------------------
Number of Rooms
3.0    14421
2.0     9723
4.0     5110
1.0     2144
5.0      608
6.0      153
7.0       34
Name: count, dty

In [9]:
# Count how many price entries are in USD and AMD
usd_count = df["Price"].astype(str).str.contains(r"\$").sum()
amd_count = df["Price"].astype(str).str.contains("֏").sum()

# Print the counts of USD and AMD prices
print("USD prices ($):", usd_count)
print("AMD prices (֏):", amd_count)

USD prices ($): 31409
AMD prices (֏): 794


In [10]:
# Convert 'Price' column to string for processing
price_str = df["Price"].astype(str)

# Convert AMD prices to USD
amd_mask = price_str.str.contains("֏")  # Identify rows with AMD currency
df.loc[amd_mask, "Price"] = (
    price_str[amd_mask]
    .str.replace(r"[^\d.]", "", regex=True)  # Remove non-numeric characters
    .astype(float) / 385                    # Convert AMD to USD (fixed rate: 1 USD = 385 AMD)
).round(2)

# Clean USD prices
usd_mask = price_str.str.contains(r"\$")  # Identify rows with USD currency
df.loc[usd_mask, "Price"] = (
    price_str[usd_mask]
    .str.replace(r"[^\d.]", "", regex=True)  # Remove '$' and other non-numeric symbols
    .astype(float)
)

# Ensure 'Price' column is float
df["Price"] = df["Price"].astype(float)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32203 entries, 0 to 33718
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        32203 non-null  object 
 1   New Building         32203 non-null  object 
 2   Elevator             32203 non-null  object 
 3   Number of Floors     32203 non-null  int64  
 4   Total Area           32203 non-null  float64
 5   Number of Rooms      32193 non-null  float64
 6   Number of Bathrooms  32203 non-null  float64
 7   Ceiling Height       32203 non-null  float64
 8   Floor                32203 non-null  float64
 9   Balcony              32203 non-null  object 
 10  Furniture            32203 non-null  object 
 11  Renovation           32203 non-null  object 
 12  Price                32203 non-null  float64
 13  District             32203 non-null  object 
dtypes: float64(6), int64(1), object(7)
memory usage: 3.7+ MB


In [12]:
df.isna().sum()

Building Type           0
New Building            0
Elevator                0
Number of Floors        0
Total Area              0
Number of Rooms        10
Number of Bathrooms     0
Ceiling Height          0
Floor                   0
Balcony                 0
Furniture               0
Renovation              0
Price                   0
District                0
dtype: int64

In [13]:
df.duplicated().sum()

np.int64(12814)

In [14]:
df = df.dropna(ignore_index=True)
df = df.drop_duplicates(ignore_index=True)

In [15]:
# Count listings priced under 100k (USD)
price_under_100k = {}
for pr in range(30000, 100001, 10000):  # Ranges: 30k, 40k, ... 100k
    price_under_100k[len(df[df["Price"] <= pr])] = pr

print(price_under_100k)

# Count listings priced above 200k (USD)
price_after_200k = {}
for pr in range(200000, 2000001, 50000):  # Ranges: 200k, 250k, ... 2M
    price_after_200k[len(df[df["Price"] >= pr])] = pr

print(price_after_200k)

{42: 30000, 106: 40000, 236: 50000, 631: 60000, 1459: 70000, 2730: 80000, 4419: 90000, 5892: 100000}
{5902: 200000, 3898: 250000, 2559: 300000, 1874: 350000, 1340: 400000, 1066: 450000, 819: 500000, 655: 550000, 508: 600000, 393: 650000, 286: 700000, 215: 750000, 161: 800000, 147: 850000, 112: 900000, 95: 950000, 85: 1000000, 73: 1050000, 65: 1100000, 52: 1150000, 42: 1200000, 29: 1250000, 21: 1300000, 18: 1400000, 14: 1450000, 13: 1500000, 10: 1550000, 9: 1650000, 6: 1800000, 3: 1900000, 1: 2000000}


In [16]:
# Keep only apartments priced between 60,000 and 280,000 USD
df = df[(df["Price"] >= 60000) & (df["Price"] <= 280000)]

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16013 entries, 0 to 19377
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Building Type        16013 non-null  object 
 1   New Building         16013 non-null  object 
 2   Elevator             16013 non-null  object 
 3   Number of Floors     16013 non-null  int64  
 4   Total Area           16013 non-null  float64
 5   Number of Rooms      16013 non-null  float64
 6   Number of Bathrooms  16013 non-null  float64
 7   Ceiling Height       16013 non-null  float64
 8   Floor                16013 non-null  float64
 9   Balcony              16013 non-null  object 
 10  Furniture            16013 non-null  object 
 11  Renovation           16013 non-null  object 
 12  Price                16013 non-null  float64
 13  District             16013 non-null  object 
dtypes: float64(6), int64(1), object(7)
memory usage: 1.8+ MB


In [18]:
# Count apartments with total area <= specific values (10, 15, ..., 50 m²)
area_befor50 = {}
for ar in range(10, 51, 5):  # Step of 5 m²
    area_befor50[len(df[df["Total Area"] <= ar])] = ar

print(area_befor50)

# Count apartments with total area >= specific values (100, 110, ..., 300 m²)
area_after100 = {}
for ar in range(100, 301, 10):  # Step of 10 m²
    area_after100[len(df[df["Total Area"] >= ar])] = ar

print(area_after100)

{1: 20, 9: 25, 76: 30, 203: 35, 665: 40, 1464: 45, 2323: 50}
{2189: 100, 1217: 110, 727: 120, 365: 130, 199: 140, 132: 150, 82: 160, 50: 170, 42: 180, 32: 190, 25: 200, 17: 210, 10: 220, 7: 230, 6: 250, 4: 260, 3: 300}


In [19]:
# Keep only apartments with total area between 30 and 130 m²
df = df[(df["Total Area"] >= 30) & (df["Total Area"] <= 130)]

In [20]:
print(len(df[df["Number of Rooms"]>=5]))
print(len(df[df["Number of Rooms"]==4]))

97
1898


In [21]:
df = df[df["Number of Rooms"] <=4]

In [22]:
df[df["Floor"]>15].shape

(239, 14)

In [23]:
df = df[df["Floor"]<=15]

In [24]:
df[df["Number of Bathrooms"]>=4].shape

(0, 14)

In [25]:
df[df["Number of Bathrooms"]==3].shape

(53, 14)

In [26]:
df = df[df["Number of Bathrooms"]<3]

In [27]:
# Get current path
current_path = os.getcwd()

# Go 2 folders back
project_root = os.path.abspath(os.path.join(current_path, "..", ".."))

# Path to processed folder
save_path = os.path.join(project_root, "Data", "processed", "processed_df.csv")

# Save CSV
df.to_csv(save_path, index=False, encoding='utf-8-sig')
print(f"Processed data saved to: {save_path}")

Processed data saved to: c:\Users\Dell\OneDrive\Desktop\Yerevan-Flat-Price-Prediction\Data\processed\processed_df.csv
