In [49]:
# import requirements 
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load dataset
data = pd.read_csv('Divar.csv')
data.shape


(1000000, 61)

In [50]:
# outlier handling and cleaning for numeric columns

def clean_numeric_columns(df):
    df_clean = df.copy()
    
    # 1. outlier handling for financial columns
    financial_cols = ['price_value', 'rent_value', 'credit_value']
    
    for col in financial_cols:
        if col in df_clean.columns:
            # zero values to NaN
            zero_mask = df_clean[col] == 0
            if zero_mask.sum() > 0:
                print(f"{col}: {zero_mask.sum()} zero values -> NaN")
                df_clean.loc[zero_mask, col] = np.nan
            
            # Winsorization for outliers
            lower_limit = df_clean[col].quantile(0.01)
            upper_limit = df_clean[col].quantile(0.99)
            
            outliers_mask = (df_clean[col] < lower_limit) | (df_clean[col] > upper_limit)
            if outliers_mask.sum() > 0:
                print(f"{col}: {outliers_mask.sum()} outlier -> winsorize")
                df_clean[col] = np.clip(df_clean[col], lower_limit, upper_limit)
    
    return df_clean

In [51]:
# specific column conversions

def transform_special_columns(df):
    df_transformed = df.copy()
    
    # 1. rooms_count convert to numeric
    if 'rooms_count' in df_transformed.columns:
        persian_to_number = {
            'یک': 1, 'دو': 2, 'سه': 3, 'چهار': 4,
            'پنج': 5, 'شش': 6, 'هفت': 7, 'هشت': 8,
            'نه': 9, 'ده': 10, 'بدون اتاق': 0,
            'پنج یا بیشتر': 5
        }
        
        # translate Persian words to numbers
        df_transformed['rooms_count'] = df_transformed['rooms_count'].map(persian_to_number)
        
        # convert to numeric
        df_transformed['rooms_count'] = pd.to_numeric(df_transformed['rooms_count'], errors='coerce')
    
    # 2. construction_year conversion
    if 'construction_year' in df_transformed.columns:
        def extract_year(x):
            if pd.isna(x):
                return np.nan
            x = str(x)
            # extract digits
            numbers = []
            for char in x:
                if char.isdigit():
                    numbers.append(char)
                elif char in ['۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹']:
                    persian_digits = {'۰':'0', '۱':'1', '۲':'2', '۳':'3', '۴':'4',
                                    '۵':'5', '۶':'6', '۷':'7', '۸':'8', '۹':'9'}
                    numbers.append(persian_digits[char])
            
            if numbers:
                year_str = ''.join(numbers[-4:])  # 4 last digits
                if len(year_str) == 4:
                    year = int(year_str)
                    # convert Gregorian to Hijri if needed
                    if 1300 <= year <= 1500:  # hijri range
                        return year
            return np.nan
        
        df_transformed['construction_year'] = df_transformed['construction_year'].apply(extract_year)
    
    return df_transformed

In [52]:
# feature engineering for clustering

def create_clustering_features(df):
    df_features = df.copy()
    
    # main features for clustering
    
    # location-based features 
    if 'city_slug' in df_features.columns:
        # Frequency encoding for city_slug
        city_freq = df_features['city_slug'].value_counts(normalize=True)
        df_features['city_freq_encoded'] = df_features['city_slug'].map(city_freq)
    
    # physical features
    # building size and number of rooms are important
    physical_features = []
    if 'building_size' in df_features.columns:
        physical_features.append('building_size')
        # make bins for building size
        df_features['size_category'] = pd.cut(df_features['building_size'], 
                                            bins=[0, 50, 100, 150, 200, 500, np.inf],
                                            labels=['x-small', 'small', 'medium', 'large', 'x-large', 'huge'])
    
    if 'rooms_count' in df_features.columns:
        physical_features.append('rooms_count')
        # categorize number of rooms
        df_features['rooms_category'] = pd.cut(df_features['rooms_count'].fillna(0),
                                             bins=[-1, 0, 1, 2, 3, 4, np.inf],
                                             labels=['no room', 'one room', 'two rooms', 'three rooms', 'four rooms', 'five rooms+'])

    # amenities (sum of boolean amenities)
    boolean_cols = ['has_elevator', 'has_parking', 'has_warehouse', 'has_balcony',
                   'has_water', 'has_electricity', 'has_gas']
    
    existing_bools = [col for col in boolean_cols if col in df_features.columns]
    if existing_bools:
        # covert boolean strings to 1/0
        for col in existing_bools:
            df_features[col] = df_features[col].map({'true': 1, 'True': 1, True: 1,
                                                    'false': 0, 'False': 0, False: 0,
                                                    np.nan: 0})
        
        df_features['amenities_count'] = df_features[existing_bools].sum(axis=1)
    
    # d) property type (from cat2_slug)
    if 'cat2_slug' in df_features.columns:
        # One-hot encoding or label encoding
        property_type_mapping = {
            'residential-sell': 0,
            'residential-rent': 1,
            'commercial-rent': 2,
            'commercial-sell': 3,
            'temporary-rent': 4,
            'real-estate-services': 5
        }
        df_features['property_type_encoded'] = df_features['cat2_slug'].map(property_type_mapping)
    
    return df_features, physical_features + ['city_freq_encoded', 'amenities_count', 'property_type_encoded']

In [53]:
# fiture engineering for prediction
def create_prediction_features(df):
    df_features = df.copy()
    
    # 1. Target Variable definition
    # so far, we have multiple targets:
    # - (price_value)
    # - (rent_value)  
    # - (credit_value)

    # 2. important features for prediction price:
    prediction_features = []
    
    # geographical features
    if all(col in df_features.columns for col in ['location_latitude', 'location_longitude']):
        # make distance from city center
        # for tehran we assume center coordinates
        tehran_center = (35.6892, 51.3890)
        df_features['distance_from_center'] = haversine_distance(
            df_features['location_latitude'], df_features['location_longitude'],
            tehran_center[0], tehran_center[1]
        )
        prediction_features.append('distance_from_center')
    
    # physical features with interactions
    if 'building_size' in df_features.columns:
        prediction_features.append('building_size')
        
        # price per sqm (if price_value exists)
        if 'price_value' in df_features.columns:
            df_features['price_per_sqm'] = df_features['price_value'] / df_features['building_size']
            # log transform for reducing skewness
            df_features['log_price_per_sqm'] = np.log1p(df_features['price_per_sqm'])
    
    if 'rooms_count' in df_features.columns:
        prediction_features.append('rooms_count')
        
        # room density
        if 'building_size' in df_features.columns:
            df_features['room_density'] = df_features['rooms_count'] / df_features['building_size']
            prediction_features.append('room_density')
    
    # property age
    if 'construction_year' in df_features.columns:
        current_year = 1404 # current year in Hijri
        df_features['property_age'] = current_year - df_features['construction_year']
        prediction_features.append('property_age')
        
        # is new property (age <= 5 years)
        df_features['is_new_property'] = (df_features['property_age'] <= 5).astype(int)
        prediction_features.append('is_new_property')
    
    # amenities (weighted score)
    amenity_weights = {
        'has_elevator': 0.3,
        'has_parking': 0.25,
        'has_warehouse': 0.1,
        'has_balcony': 0.15,
        'has_pool': 0.35,  # luxury amenities
        'has_jacuzzi': 0.35,
        'has_sauna': 0.35
    }
    
    amenities_score = 0
    for amenity, weight in amenity_weights.items():
        if amenity in df_features.columns:
            df_features[amenity] = df_features[amenity].map({'true': 1, 'True': 1, True: 1,
                                                            'false': 0, 'False': 0, False: 0,
                                                            np.nan: 0})
            amenities_score += df_features[amenity] * weight
    
    df_features['amenities_score'] = amenities_score
    prediction_features.append('amenities_score')
    
    # user type feature
    if 'user_type' in df_features.columns:
        # agent vs individual
        df_features['is_agent'] = df_features['user_type'].apply(
            lambda x: 1 if x == 'مشاور املاک' else 0
        )
        prediction_features.append('is_agent')
    
    return df_features, prediction_features

def haversine_distance(lat1, lon1, lat2, lon2):
    """haversine distance calculation"""
    R = 6371  # earth radius in kilometers
    
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    
    return R * c

In [54]:
# encoding categorical features

def encode_categorical_columns(df, method='onehot'):
    """
    deferent ways for encoding categorical columns:
    - onehot: one-hot encoding for low cardinality columns
    - frequency: frequency encoding for high cardinality columns
    - target: target encoding (to be implemented with cross-validation)
    """
    df_encoded = df.copy()
    
    # detecting categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
    
    # dropping text columns from categorical
    text_cols = ['description', 'title']
    categorical_cols = [col for col in categorical_cols if col not in text_cols]
    
    if method == 'onehot':
        # One-hot encoding for low cardinality columns
        for col in categorical_cols:
            if df[col].nunique() <= 10:  # 10 unique values threshold
                dummies = pd.get_dummies(df[col], prefix=col, drop_first=True)
                df_encoded = pd.concat([df_encoded, dummies], axis=1)
                df_encoded = df_encoded.drop(col, axis=1)
    
    elif method == 'frequency':
        # Frequency encoding for high cardinality columns
        for col in categorical_cols:
            if df[col].nunique() > 10:
                freq = df[col].value_counts(normalize=True)
                df_encoded[f'{col}_freq'] = df[col].map(freq)
                df_encoded = df_encoded.drop(col, axis=1)
    
    elif method == 'target':
        # Target encoding (to be implemented with cross-validation)
        pass
    
    return df_encoded

In [55]:
# handling missing values
def strategic_missing_value_handling(df):
    """
    missing values management strategy:
    1. drop columns with more than 80% missing
    2. numeric columns: impute with median
    3. categorical columns: impute with mode
    """
    df_clean = df.copy()
    
    # 1. drop columns with more than 60% missing
    missing_threshold = 0.6
    high_missing_cols = df_clean.columns[df_clean.isnull().mean() > missing_threshold].tolist()
    print(f"drop columns with more than {missing_threshold*100}% missing: {len(high_missing_cols)} columns")
    df_clean = df_clean.drop(columns=high_missing_cols)
    
    # 2. numeric columns - impute with median (except target)
    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    target_cols = ['price_value', 'rent_value', 'credit_value']
    numeric_for_impute = [col for col in numeric_cols if col not in target_cols]
    
    for col in numeric_for_impute:
        if df_clean[col].isnull().sum() > 0:
            median_val = df_clean[col].median()
            df_clean[col] = df_clean[col].fillna(median_val)
    
    # 3. categorical columns - impute with mode
    categorical_cols = df_clean.select_dtypes(include=['object']).columns.tolist()
    for col in categorical_cols:
        if df_clean[col].isnull().sum() > 0:
            mode_val = df_clean[col].mode()[0] if not df_clean[col].mode().empty else 'Unknown'
            df_clean[col] = df_clean[col].fillna(mode_val)
    
    return df_clean

In [56]:
# preprocessing pipeline class
class RealEstatePreprocessor:
    """preprocessing pipeline for real estate data"""
    
    def __init__(self, clustering_features=None, prediction_features=None):
        self.clustering_features = clustering_features or []
        self.prediction_features = prediction_features or []
        self.scalers = {}
        self.encoders = {}
    
    def fit_transform(self, df, task='clustering'):
        """preprocessing data bases on task"""
        df_processed = df.copy()
        
        # 1. initial cleaning
        df_processed = clean_numeric_columns(df_processed)
        df_processed = transform_special_columns(df_processed)
        
        # 2. missing values management
        df_processed = strategic_missing_value_handling(df_processed)
        
        # 3. Feature Engineering
        if task == 'clustering':
            df_processed, features = create_clustering_features(df_processed)
            self.clustering_features = features
        
        elif task == 'prediction':
            df_processed, features = create_prediction_features(df_processed)
            self.prediction_features = features
            
            # for prediction، just row with available target
            target_cols = [col for col in ['price_value', 'rent_value', 'credit_value'] 
                          if col in df_processed.columns]
            if target_cols:
                # drop rows with all target missing
                mask = df_processed[target_cols].isnull().all(axis=1)
                df_processed = df_processed[~mask]
        
        # 4. Encoding
        df_processed = encode_categorical_columns(df_processed, method='frequency')
        
        # 5. Scaling (only for clustering)
        if task == 'clustering':
            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
            features_to_scale = [col for col in self.clustering_features if col in df_processed.columns]
            if features_to_scale:
                df_processed[features_to_scale] = scaler.fit_transform(df_processed[features_to_scale])
                self.scalers['clustering'] = scaler
        
        return df_processed

In [57]:
# drop Unnamed: 0 column
# data = data.drop('Unnamed: 0', axis=1)

# drop columns with high missing values
missing_threshold = 0.6
cols_to_drop = data.columns[data.isnull().mean() > missing_threshold]
data = data.drop(columns=cols_to_drop)

# winsorize financial columns
df_clean = data.copy()
    
# 1. outlier handling for financial columns
financial_cols = ['price_value', 'rent_value', 'credit_value']

for col in financial_cols:
    if col in df_clean.columns:
        # zero values to NaN
        zero_mask = df_clean[col] == 0
        if zero_mask.sum() > 0:
            print(f"{col}: {zero_mask.sum()} zero values -> NaN")
            df_clean.loc[zero_mask, col] = np.nan
        
        # Winsorization for outliers
        lower_limit = df_clean[col].quantile(0.01)
        upper_limit = df_clean[col].quantile(0.99)
        
        outliers_mask = (df_clean[col] < lower_limit) | (df_clean[col] > upper_limit)
        if outliers_mask.sum() > 0:
            print(f"{col}: {outliers_mask.sum()} outlier -> winsorize")
            df_clean[col] = np.clip(df_clean[col], lower_limit, upper_limit)

data = df_clean.copy()
data.shape

price_value: 1902 zero values -> NaN
price_value: 10826 outlier -> winsorize


(1000000, 23)

In [58]:
# preprocessing pipeline instantiation
## Sample data for faster processing
# sample_data = data.sample(n=100000, random_state=41)

# Preprocess for clustering
preprocessor = RealEstatePreprocessor()
clustering_data = preprocessor.fit_transform(data, task='clustering')
# drop discription & title columns if exist
clustering_data = clustering_data.drop(columns=[col for col in ['description', 'title'] if col in clustering_data.columns])
# drop rows with price_value missing
clustering_data = clustering_data[clustering_data['price_value'].notnull()]

# select clustering features
features = preprocessor.clustering_features
X_cluster = clustering_data[features]

# K-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_cluster)


clustering_data.shape

price_value: 5665 outlier -> winsorize
drop columns with more than 60.0% missing: 0 columns


(566444, 26)

In [59]:
# only rows with price_value
prediction_data = data[data['price_value'].notnull()].copy()

# Preprocess
preprocessor = RealEstatePreprocessor()
processed_data = preprocessor.fit_transform(prediction_data, task='prediction')

# train-test split
from sklearn.model_selection import train_test_split
X = processed_data[preprocessor.prediction_features]
y = processed_data['price_value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

price_value: 5665 outlier -> winsorize
drop columns with more than 60.0% missing: 0 columns


In [60]:
clustering_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 566444 entries, 1 to 999997
Data columns (total 26 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   Unnamed: 0              566444 non-null  int64   
 1   cat2_slug               566444 non-null  object  
 2   price_mode              566444 non-null  object  
 3   price_value             566444 non-null  float64 
 4   building_size           566444 non-null  float64 
 5   rooms_count             566444 non-null  float64 
 6   has_balcony             566340 non-null  float64 
 7   has_elevator            566444 non-null  int64   
 8   has_warehouse           566444 non-null  int64   
 9   has_parking             566444 non-null  int64   
 10  construction_year       566444 non-null  float64 
 11  is_rebuilt              566444 non-null  bool    
 12  has_restroom            566444 non-null  object  
 13  floor_material          566444 non-null  object  
 14  location_

In [61]:
# output to csv
clustering_data.to_csv('clustering_data.csv', index=False)
