In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Furnishing codes stored in the CSV and the labels they are replaced with:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df = (
    pd.read_csv('../../prepared_data/post_feature_selection.csv')
    # Keep only the rows that have a floor category.
    .dropna(subset=['floor_category'])
    .assign(
        furnishing_type=lambda frame: frame['furnishing_type'].replace(
            {0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'}
        )
    )
    # Drop flats whose price is exactly 14.00.
    # NOTE(review): the reason for this filter is not recorded here —
    # presumably known-bad listings; confirm.
    .loc[lambda frame: ~((frame['property_type']=='flat') & (frame['price']==14.00))]
)

# Features are every column except the target; the target is the raw price.
X = df.drop(columns=['price'])
y = df['price']

def transform_input(df):
    """One-hot encode ``sector`` / ``floor_category`` and turn the remaining
    categorical columns into fixed numeric codes.

    Rows with a null ``floor_category`` are dropped and the index is reset to
    ``0..n-1``. A fresh ``OneHotEncoder`` is fitted on the frame passed in, so
    the one-hot columns produced depend on the categories present in that
    frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sector``, ``floor_category``, ``furnishing_type``,
        ``luxury_category``, ``agePossession`` and ``property_type``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the two one-hot source columns removed, their
        indicator columns appended on the right, and the four categorical
        columns replaced by numbers. The frame passed in is left untouched.
    """
    one_hot_source = ['sector', 'floor_category']

    def lookup(codes, fallback):
        """Return a label -> code function; unlisted labels get ``fallback``."""
        return lambda label: codes.get(label, fallback)

    kept = df[df['floor_category'].notnull()]

    # Fit the encoder on this frame and wrap the dense result in a DataFrame
    # whose columns carry the encoder's generated feature names.
    ohe = OneHotEncoder(sparse_output=False)
    indicators = pd.DataFrame(
        ohe.fit_transform(kept[one_hot_source]),
        columns=ohe.get_feature_names_out(),
    )

    # The indicator frame has a 0..n-1 index, so the filtered rows need the
    # same index before the column-wise concat or the rows would not line up.
    kept = kept.reset_index(drop=True)
    combined = pd.concat([kept, indicators], axis=1)

    # Only a heads-up: nothing is dropped or raised when nulls are present.
    if combined.isnull().values.any():
        print("Null values found in the DataFrame after encoding:")

    combined = combined.drop(columns=one_hot_source)

    # (column, label -> code table, code used for any label not in the table)
    numeric_codes = (
        ('furnishing_type', {'furnished': 2.32, 'semifurnished': 2.10}, 1.30),
        ('luxury_category', {'High': 1.95, 'Medium': 1.535}, 1.325),
        (
            'agePossession',
            {
                'Moderately Old': 1.85,
                'New Property': 1.35,
                'Old Property': 2.20,
                'Relatively New': 1.45,
            },
            1.33,
        ),
        ('property_type', {'flat': 1.38}, 4),
    )
    for column, codes, fallback in numeric_codes:
        combined[column] = combined[column].apply(lookup(codes, fallback))

    return combined

# Replace the raw feature frame with its fully numeric encoding.
# This rebinds X, so the cell cannot be re-run without first re-running the
# statements that build X: transform_input drops the columns it needs.
X = transform_input(X)

# Log-scale copy of the target; `scorer` trains on it and inverts it with
# np.expm1 before computing the MAE.
y_transformed = np.log1p(y)

# 70/30 hold-out split used by the dtree / gbr cells.
# NOTE(review): this split uses the raw `y`, not `y_transformed`, so those
# models are fitted on the untransformed price — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.30,
    random_state=104,
    shuffle=True,
)

# Only the non-default hyperparameters are spelled out; the rest keep
# scikit-learn's defaults (passing monotonic_cst explicitly fails on
# scikit-learn releases that predate that parameter).
# random_state is pinned because the tree permutes candidate features at each
# split even with splitter='best', so an unseeded tree can score differently
# from run to run.
dtree = DecisionTreeRegressor(
    criterion='absolute_error',
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=104,
)



  from pandas.core import (


In [16]:
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from xgboost import XGBRegressor  # Assuming you have XGBoost installed

# Seed shared by every estimator that accepts one, so the leaderboard built
# from this dictionary is the same on every run.
MODEL_SEED = 42

# Display name -> unfitted estimator. Hyperparameters are library defaults
# apart from the seed.
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(random_state=MODEL_SEED),
    'random forest':RandomForestRegressor(random_state=MODEL_SEED),
    'extra trees': ExtraTreesRegressor(random_state=MODEL_SEED),
    'gradient boosting': GradientBoostingRegressor(random_state=MODEL_SEED),
    'adaboost': AdaBoostRegressor(random_state=MODEL_SEED),
    'xgboost':XGBRegressor(random_state=MODEL_SEED)
}

def scorer(model_name, model):
    """Fit ``model`` on an 80/20 split and report how it did.

    The split is taken from the notebook-level ``X`` and ``y_transformed``
    (the log1p-scaled target) with a fixed seed, so every model is scored on
    the same rows.

    Parameters
    ----------
    model_name : str
        Label placed first in the returned row.
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in
        place.

    Returns
    -------
    list
        ``[model_name, score, mae]``. ``score`` is ``model.score`` on the
        held-out rows in the log1p-scaled target space; ``mae`` is the mean
        absolute error after both predictions and targets are mapped back
        with ``np.expm1``.
    """
    features_fit, features_eval, target_fit, target_eval = train_test_split(
        X, y_transformed, test_size=0.2, random_state=42
    )

    model.fit(features_fit, target_fit)
    holdout_score = model.score(features_eval, target_eval)

    # Undo the log1p scaling on both sides so the error is in price units.
    predicted_price = np.expm1(model.predict(features_eval))
    actual_price = np.expm1(target_eval)
    holdout_mae = mean_absolute_error(actual_price, predicted_price)

    return [model_name, holdout_score, holdout_mae]

# One [name, r2, mae] row per candidate model.
model_output = [
    scorer(label, estimator) for label, estimator in model_dict.items()
]

model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

# Leaderboard: lowest mean absolute error first.
model_df.sort_values(by='mae')

Unnamed: 0,name,r2,mae
9,xgboost,0.86708,0.570838
6,extra trees,0.834917,0.573603
5,random forest,0.844041,0.594236
0,linear_reg,0.823337,0.680731
2,ridge,0.823975,0.68171
7,gradient boosting,0.810202,0.714473
4,decision tree,0.759744,0.735138
8,adaboost,0.610839,1.044144
1,svr,0.472477,1.075049
3,LASSO,0.433861,1.207928


In [2]:
# Fit the decision tree on the 70% training split (the target is the raw price).
dtree.fit(X_train,y_train)

In [3]:
# R² of the tree on the rows it was trained on.
dtree.score(X_train,y_train)

0.7738666770634407

In [4]:
# R² of the tree on the held-out 30% split; the gap to the training score
# shows how much the tree overfits.
dtree.score(X_test,y_test)

0.6227318124731798

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default hyperparameters and a fixed seed, fitted on
# the same training split as the decision tree (fit returns the estimator).
gbr = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# R² on the rows the model was trained on.
gbr.score(X_train, y_train)

0.8659198364102894

In [12]:
# R² of the gradient-boosting model on the held-out 30% split.
gbr.score(X_test,y_test)

0.7524092930879209

In [13]:
from sklearn.metrics import mean_absolute_error

# Predict the held-out rows and report the mean absolute error. The models in
# this part of the notebook are fitted on the raw price, so the error is in
# price units. The call must be the cell's last expression so that its value
# is displayed as the cell output.
y_pred_gbr = gbr.predict(X_test)
mean_absolute_error(y_test, y_pred_gbr)


0.7216352512893671