In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

In [54]:
# Load the prepared dataset; the 'id' column is used as the DataFrame index.
df = pd.read_csv(
    'prepare_df.csv',
    index_col='id',
)

### Data Splitting

In [55]:
# Target: the log-transformed price.
train_y = df['price_log']
# Features: every column except the raw and log-transformed price.
train_X = df.drop(columns=['price', 'price_log'])

### Dividing and Selecting Features

In [56]:
# Build the feature groups used by the preprocessing steps.
#
# Every group derived through set arithmetic is passed through `sorted` rather
# than `list`: iterating a set of strings has no guaranteed order and can
# change from one kernel session to the next, which would silently reorder the
# model's feature columns between runs.

# Columns whose name contains 'amenity' (kept in DataFrame order).
amenity_col = list(df.filter(like='amenity').columns)
# Object-dtype columns, excluding 'city' and 'locality'.
cat_cols = sorted(set(df.select_dtypes(include=['object']).columns) - {'city', 'locality'})
# Numeric columns, excluding the two price targets and the amenity columns.
num_cols = sorted(set(df.select_dtypes(include='number').columns) - {'price', 'price_log'} - set(amenity_col))
# 'zone' is handled separately from the remaining categorical columns.
large_cat = ['zone']
small_cat = sorted(set(cat_cols) - set(large_cat))
number_cols = sorted(set(num_cols) - {'city', 'locality'})


In [57]:
# Show the three feature groups as a quick sanity check.
small_cat, large_cat, number_cols

(['building_nature', 'building_type', 'division', 'purpose'],
 ['zone'],
 ['area', 'num_bath_rooms', 'num_bed_rooms'])

### Custom Transformers

In [58]:
from sklearn.base import BaseEstimator, TransformerMixin

class CatBoostEncoderTransformer(BaseEstimator, TransformerMixin):
    """Encode selected columns with one ``ce.CatBoostEncoder`` per column.

    Parameters
    ----------
    columns : list of str, optional
        Names of the columns to encode. When ``None``, every object- or
        category-dtype column of the frame passed to ``fit`` is encoded.

    Attributes
    ----------
    encoders : dict
        Maps each encoded column name to its fitted ``ce.CatBoostEncoder``.
        Rebuilt from scratch on every call to ``fit``.

    Notes
    -----
    NOTE(review): the ``fit_transform`` inherited from ``TransformerMixin``
    calls ``transform`` without ``y``. category_encoders documents passing
    ``y`` to ``transform`` when encoding the training set; confirm whether the
    current behaviour is intended for training data.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one encoder per selected column of ``X`` against target ``y``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the columns to encode.
        y : array-like, optional
            Target values forwarded to each ``CatBoostEncoder.fit`` call.

        Returns
        -------
        self
        """
        if self.columns is None:
            # Previously this case raised TypeError; fall back to the
            # categorical-looking columns of X instead.
            columns = X.select_dtypes(include=['object', 'category']).columns.tolist()
        else:
            columns = list(self.columns)

        # Build into a fresh dict so that a refit never keeps encoders for
        # columns that are no longer selected.
        fitted = {}
        for col in columns:
            encoder = ce.CatBoostEncoder()
            encoder.fit(X[col], y)
            fitted[col] = encoder
        self.encoders = fitted

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every fitted column replaced by its encoding.

        ``X`` itself is not modified; columns without a fitted encoder are
        copied over unchanged.
        """
        transformed_X = X.copy()

        for col, encoder in self.encoders.items():
            transformed_X[col] = encoder.transform(X[col])

        return transformed_X

class OneHotEncoderTransformer(BaseEstimator, TransformerMixin):
    """One-hot encode selected DataFrame columns and return a DataFrame.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode. When ``None``, the object- and category-dtype
        columns of the frame passed to ``fit`` are used.
    drop_original : bool, default True
        When True, ``transform`` returns ``X`` with the source columns removed
        and the one-hot columns joined on. When False, ``transform`` returns
        only the one-hot columns.
        NOTE(review): with ``False`` the original columns are not kept either,
        which the parameter name does not suggest; confirm this is intended.

    Attributes
    ----------
    encoder : sklearn.preprocessing.OneHotEncoder
        Dense-output encoder created with ``handle_unknown='ignore'``.
    new_columns : array of str or None
        Output feature names, set by ``fit``.
    """

    def __init__(self, columns=None, drop_original=True):
        self.columns = columns
        self.drop_original = drop_original
        # scikit-learn 1.2 renamed ``sparse`` to ``sparse_output`` and 1.4
        # removed the old keyword; try the new name first and fall back for
        # older releases.
        try:
            self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            self.encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        self.new_columns = None

    def fit(self, X, y=None):
        """Fit the encoder on ``X[self.columns]`` and record the output names.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        if self.columns is None:
            # Infer the columns from X instead of reading a notebook-level
            # global (the previous fallback called ``.tolist()`` on a plain
            # list and always raised AttributeError).
            self.columns = X.select_dtypes(include=['object', 'category']).columns.tolist()

        self.encoder.fit(X[self.columns])
        self.new_columns = self.encoder.get_feature_names_out(self.columns)

        return self

    def transform(self, X):
        """Return the one-hot encoding of ``X[self.columns]`` as a DataFrame.

        The result is indexed like ``X``; see ``drop_original`` for which
        columns it contains.
        """
        encoded = pd.DataFrame(
            self.encoder.transform(X[self.columns]),
            columns=self.new_columns,
            index=X.index,
        )

        if self.drop_original:
            return X.drop(columns=self.columns).join(encoded)

        return encoded
    
class NumberColsStandardScaler(BaseEstimator, TransformerMixin):
    """Apply a ``StandardScaler`` to selected columns of a DataFrame.

    Parameters
    ----------
    number_cols : list of str, optional
        Columns to scale. When ``None``, ``fit`` selects the numeric columns
        of the frame it receives and stores them on ``number_cols``.
    """

    def __init__(self, number_cols=None):
        self.number_cols = number_cols
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the scaler on the selected columns of ``X``; ``y`` is ignored."""
        cols = self.number_cols
        if cols is None:
            cols = X.select_dtypes(include='number').columns.tolist()
            self.number_cols = cols

        self.scaler.fit(X[cols])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose selected columns hold the scaled values."""
        scaled_values = self.scaler.transform(X[self.number_cols])

        result = X.copy()
        result[self.number_cols] = scaled_values
        return result
        
class PassAmenityColumns(BaseEstimator, TransformerMixin):
    """Identity transformer: ``transform`` hands back exactly what it is given."""

    def __init__(self):
        # No configuration and no state to set up.
        pass

    def fit(self, X=None, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, df):
        """Return ``df`` unchanged."""
        return df

### Column Transformer

In [59]:
from sklearn.compose import ColumnTransformer

# Encode the 'zone' column against the target before building the
# ColumnTransformer, then treat the encoded 'zone' as a numeric column.
large_cat_transformer = CatBoostEncoderTransformer(columns=large_cat)
# Fit on the raw values held in `df`, not on `train_X`: this cell overwrites
# train_X['zone'] below, so fitting on `train_X` would see already-encoded
# values if the cell were executed a second time.
# NOTE(review): assumes df['zone'] still holds the same raw rows that
# `train_X` was built from - confirm no earlier cell filters either frame.
Transformed_large_cat = large_cat_transformer.fit_transform(df[large_cat], train_y)
# Guard the append so re-running the cell does not list 'zone' twice.
if 'zone' not in number_cols:
    number_cols.append('zone')
train_X['zone'] = Transformed_large_cat['zone']
small_cat_transformer = OneHotEncoderTransformer(columns=small_cat, drop_original=True)
scaler_transformer = NumberColsStandardScaler(number_cols=number_cols)

# One branch per feature group: one-hot for the small categoricals, scaling
# for the numeric columns (now including 'zone'), amenity columns unchanged.
preprocessor = ColumnTransformer(transformers=[
    ('small_cat', small_cat_transformer, small_cat),
    ('scaling', scaler_transformer, number_cols),
    ('pass_amenity_cols', PassAmenityColumns(), amenity_col),
])

In [60]:
from sklearn.pipeline import Pipeline
# Single-step pipeline wrapping the column preprocessor.
preprocessing_step = ('preprocessing', preprocessor)
full_pipeline = Pipeline(steps=[preprocessing_step])

In [61]:
# Fit the preprocessor on the training features and transform them in one call.
prepared_train_x = preprocessor.fit_transform(train_X,train_y)

In [62]:

def prepare_input_for_model(x):
    """Apply the fitted preprocessing to new rows before prediction.

    The 'zone' column is first encoded with the already-fitted
    ``large_cat_transformer``; the result is then passed through the
    already-fitted ``preprocessor``.

    Parameters
    ----------
    x : pandas.DataFrame
        Raw feature rows. The frame is not modified.

    Returns
    -------
    The output of ``preprocessor.transform`` for the encoded rows.
    """
    # Work on a copy: writing the encoded 'zone' back into the caller's frame
    # would make a second call on the same frame encode already-encoded values.
    prepared = x.copy()
    prepared['zone'] = large_cat_transformer.transform(x)['zone']
    return preprocessor.transform(prepared)

# Model Development


## RandomForestRegressor

In [68]:
from sklearn.ensemble import RandomForestRegressor
# Pin the seed: a random forest draws bootstrap samples and candidate features
# at random, so without `random_state` each run fits a different model.
rf_reg = RandomForestRegressor(max_features=8, n_estimators=15, random_state=42)
rf_reg.fit(prepared_train_x, train_y)

RandomForestRegressor(max_features=8, n_estimators=15, random_state=42)