In [6]:
import pandas as pd
import numpy as np
import os
from google.colab import drive

# Make Google Drive visible inside the Colab runtime.
drive.mount('/content/drive')

# Project data folder on the mounted drive. The trailing slash matters:
# file names are appended to this string directly.
base_path = '/content/drive/MyDrive/AI_Bootcamp/Divar_Project/data/'
input_file = base_path + 'Divar.csv'
city_file = base_path + 'iran_city_classification.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
#Load Data
# Every column listed here is read straight into a pandas categorical.
dtype_map = dict.fromkeys(
    [
        'cat2_slug',
        'cat3_slug',
        'city_slug',
        'neighborhood_slug',
        'deed_type',
        'user_type',
    ],
    'category',
)

# low_memory=False makes pandas read the file in one pass instead of chunked
# type inference.
df = pd.read_csv(input_file, low_memory=False, dtype=dtype_map)

# Parse the month column as datetime; values that cannot be parsed become NaT
# because of errors='coerce'.
df['created_at_month'] = pd.to_datetime(df['created_at_month'], errors='coerce')

In [15]:
#Data Cleaning & Imputation

#Boolean Columns (Handling 'unselect' and NaNs)
bool_cols = [
    'has_balcony',
    'has_elevator',
    'has_warehouse',
    'has_parking',
    'has_business_deed',
    'is_rebuilt',
    'has_pool',
    'has_sauna',
    'has_jacuzzi',
    'has_security_guard',
]

# Only columns that actually exist in the frame are touched. After the string
# cast, anything that is not "true" (case-insensitive) -- including missing
# values, which become the text "nan" -- collapses to False.
for col in (name for name in bool_cols if name in df.columns):
    df[col] = df[col].astype(str).str.lower().eq('true')

#Fixing Persian Numbers in Construction Year
# Persian digit -> ASCII digit table, built once rather than on every call
# (the function is applied row by row).
_PERSIAN_TO_ASCII_DIGITS = str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789')


def fix_persian_year(x):
    """Normalise one raw ``construction_year`` value to an integer year.

    Rules, applied in order:

    * Missing values (anything ``pd.isna`` accepts) are returned unchanged.
    * Text containing 'قبل' (the "before 1370" bucket) maps to 1369.
    * Persian digits are translated to ASCII digits, then the text is parsed
      with ``int(float(...))``, so ``'1390.0'`` becomes ``1390``.
    * Text that still cannot be parsed as a finite number yields ``np.nan``.

    Args:
        x: Raw cell value; it is converted with ``str()`` before parsing.

    Returns:
        An ``int`` year, ``np.nan`` for unparseable text, or ``x`` itself when
        ``x`` is missing.
    """
    if pd.isna(x):
        return x

    text = str(x).strip()

    # Handle "Before 1370" -> convert to 1369
    if 'قبل' in text:
        return 1369

    text = text.translate(_PERSIAN_TO_ASCII_DIGITS)

    try:
        return int(float(text))
    except (ValueError, OverflowError):
        # ValueError: not a number (or "nan"); OverflowError: "inf".
        # Catching only these lets KeyboardInterrupt / SystemExit propagate,
        # which a bare ``except:`` would have swallowed.
        return np.nan

# Normalise every raw construction-year value (Persian digits, "before 1370").
df['construction_year'] = df['construction_year'].apply(fix_persian_year)


#Impute 'total_floors_count' for Villa
# Rows whose property_type mentions "villa" or "house" (case-insensitive) and
# have no floor count recorded are set to 1 floor.
if 'property_type' in df.columns:
    villa_mask = df['property_type'].str.contains('villa|house', case=False, na=False)
    df.loc[villa_mask & df['total_floors_count'].isna(), 'total_floors_count'] = 1

#Imputation for Construction Year
# Fill missing years with the median year of the same neighbourhood.
# 'neighborhood_slug' is loaded as a categorical, so observed=True is passed
# explicitly: it restricts grouping to categories that occur in the data and
# avoids the pandas FutureWarning about the changing default. The transform
# result is the same either way.
# Rows with no neighbourhood, or whose neighbourhood has no known year at all,
# get no median and therefore stay NaN.
df['construction_year'] = df['construction_year'].fillna(
    df.groupby('neighborhood_slug', observed=True)['construction_year'].transform('median')
)

print("Missing years filled.")

Missing years filled.


  df.groupby('neighborhood_slug')['construction_year'].transform('median')


In [16]:
# Outlier Removal & Feature Engineering

# Year bounds shared by the outlier filter and the age feature, so the two
# cannot drift apart when the reference year is updated.
MIN_CONSTRUCTION_YEAR = 1300
REFERENCE_YEAR = 1404

# Remove Outliers (Years)
# Keep years in [1300, 1404] (both ends inclusive). Rows whose year is still
# NaN fail the comparison and are dropped as well.
df = df[df['construction_year'].between(MIN_CONSTRUCTION_YEAR, REFERENCE_YEAR)]

# Remove Logical Errors (Floor > Total Floors)
# valid_floors is True where both values are present, i.e. the row can be checked.
valid_floors = df['floor'].notna() & df['total_floors_count'].notna()
# Keep rows that cannot be checked (either value missing) OR where
# floor <= total_floors_count.
df = df[~valid_floors | (df['floor'] <= df['total_floors_count'])]

# Create 'Building Age'
df['building_age'] = REFERENCE_YEAR - df['construction_year']

# Create 'Luxury Score'
# Count of the amenity flags that are True for each row (0 to 4).
LUXURY_FLAGS = ['has_pool', 'has_sauna', 'has_jacuzzi', 'has_security_guard']
df['luxury_score'] = df[LUXURY_FLAGS].astype(int).sum(axis=1)

# Create 'is_metropolis' for Hypothesis Testing
# We define the major metropolises manually to avoid translation issues
metropolises = ['tehran', 'mashhad', 'isfahan', 'karaj', 'shiraz', 'tabriz', 'ahvaz', 'qom']
df['is_metropolis'] = df['city_slug'].isin(metropolises).astype(int)

print("New Features: 'building_age', 'luxury_score', 'is_metropolis'")

New Features: 'building_age', 'luxury_score', 'is_metropolis'


In [17]:
# Saving Cleaned Data
# Write the cleaned frame as Parquet into the project data folder; the row
# index is not stored.
output_file = base_path + 'divar_cleaned_ABT.parquet'
df.to_parquet(path=output_file, index=False)