In [1]:
import pandas as pd 
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

df = pd.read_csv('../../prepared_data/post_feature_selection.csv')

df = df[~(df['floor_category'].isnull())]

# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

df = df[~((df['property_type']=='flat') & (df['price']==14.00))]

X = df.drop(columns=['price'])
y = df['price']

def transform_input(df):
    df = df[~(df['floor_category'].isnull())]

    
    # Create the encoder
    encoder = OneHotEncoder(sparse_output=False)

    # Assuming that 'df' is your DataFrame and 'column_to_encode' is the column you want to encode
    encoded_columns = encoder.fit_transform(df[['sector','floor_category']])

    # The result is a numpy array of encoded columns
    
    df.reset_index(drop=True, inplace=True)
    # Assuming that 'df' is your DataFrame and 'encoded_columns' is the one-hot encoded numpy array

    encoded_df = pd.DataFrame(encoded_columns, columns=encoder.get_feature_names_out())

    # Concatenate the original DataFrame and the encoded DataFrame
    df = pd.concat([df, encoded_df], axis=1)    
    
    # Check for any null values
    if df.isnull().values.any():
        print("Null values found in the DataFrame after encoding:")
        
    df =df.drop(columns=['sector','floor_category'])
    
    def encode_furnish(ftype):
        if ftype=='furnished':
            return 2.32
        elif ftype=='semifurnished':
            return 2.10
        else:
            return 1.30
    
    df['furnishing_type']=df['furnishing_type'].apply(encode_furnish)
    
    def encode_luxury(ltype):
        if ltype=='High':
            return 1.95
        elif ltype=='Medium':
            return 1.535
        else:
            return 1.325
    df['luxury_category']=df['luxury_category'].apply(encode_luxury)
    
    def encode_ap(atype):
        if atype=='Moderately Old':
            return 1.85
        elif atype=='New Property':
            return 1.35
        elif atype=='Old Property':
            return 2.20
        elif atype=='Relatively New':
            return 1.45
        else:
            return 1.33
    df['agePossession']=df['agePossession'].apply(encode_ap)
    
    def encode_ptype(ptype):
        if ptype=='flat':
            return 1.38
        else:
            return 4
    df['property_type'] = df['property_type'].apply(encode_ptype)
    
    return df

X = transform_input(X)
    

# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

X_train, X_test, y_train, y_test = train_test_split(X,y , 
                                   random_state=104,  
                                   test_size=0.30,  
                                   shuffle=True) 

dtree = DecisionTreeRegressor(criterion='absolute_error', splitter='best', max_depth=10, min_samples_split=10, min_samples_leaf=4, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None)



  from pandas.core import (


In [16]:
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from xgboost import XGBRegressor  # Assuming you have XGBoost installed

model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'xgboost':XGBRegressor()
}

def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)

    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    model.fit(X_train,y_train)
    
    scores = model.score(X_test,y_test)
    
    output.append(scores)
    
    y_pred = model.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))
    
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
9,xgboost,0.86708,0.570838
6,extra trees,0.834917,0.573603
5,random forest,0.844041,0.594236
0,linear_reg,0.823337,0.680731
2,ridge,0.823975,0.68171
7,gradient boosting,0.810202,0.714473
4,decision tree,0.759744,0.735138
8,adaboost,0.610839,1.044144
1,svr,0.472477,1.075049
3,LASSO,0.433861,1.207928


In [17]:
X.head()

Unnamed: 0,property_type,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,...,sector_sector 93,sector_sector 95,sector_sector 99,sector_sector 99a,sector_sector 9a,sector_sohna road,sector_sohna road road,floor_category_High Floor,floor_category_Low Floor,floor_category_Mid Floor
0,1.38,4,4,3,1.33,2477.0,0,0,1.3,1.325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.38,3,3,3,1.33,1403.0,0,0,1.3,1.535,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.38,2,2,2,1.33,103.0,0,0,1.3,1.325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4.0,1,1,0,1.85,3800.0,0,0,1.3,1.325,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.38,3,4,4,1.45,1186.0,1,0,2.1,1.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 53,7.4,4,4,3,Under Construction,2477.0,0,0,unfurnished,Low,High Floor
1,flat,sector 33,1.5,3,3,3,Under Construction,1403.0,0,0,unfurnished,Medium,Mid Floor
2,flat,sector 33,1.15,2,2,2,Under Construction,103.0,0,0,unfurnished,Low,High Floor
3,house,sector 3,3.5,1,1,0,Moderately Old,3800.0,0,0,unfurnished,Low,Low Floor
4,flat,sector 106,1.15,3,4,4,Relatively New,1186.0,1,0,semifurnished,High,High Floor


In [20]:
import pickle
with open('../../models/xgb_model.pkl','rb') as file:
    xgb_model = pickle.load(file)

configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.



In [22]:
inp = pd.read_csv('../inp.csv')

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [44]:
linp=['flat', 'dwarka expressway', 1, 1, 0, 'Moderately Old', 0, 0, 0,
       'furnished', 'High', 'High Floor']

In [45]:
linp = transform_input(inp)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [42]:
xgb_model.predict(linp)

ValueError: feature_names mismatch: ['property_type', 'bedRoom', 'bathroom', 'balcony', 'agePossession', 'built_up_area', 'servant room', 'store room', 'furnishing_type', 'luxury_category', 'sector_dwarka expressway', 'sector_gwal pahari', 'sector_manesar', 'sector_new', 'sector_new sector 2', 'sector_sector 1', 'sector_sector 102', 'sector_sector 103', 'sector_sector 104', 'sector_sector 105', 'sector_sector 106', 'sector_sector 107', 'sector_sector 108', 'sector_sector 109', 'sector_sector 10a', 'sector_sector 11', 'sector_sector 110', 'sector_sector 111', 'sector_sector 112', 'sector_sector 113', 'sector_sector 12', 'sector_sector 13', 'sector_sector 14', 'sector_sector 15', 'sector_sector 17', 'sector_sector 17a', 'sector_sector 17b', 'sector_sector 2', 'sector_sector 21', 'sector_sector 22', 'sector_sector 23', 'sector_sector 24', 'sector_sector 25', 'sector_sector 26', 'sector_sector 27', 'sector_sector 28', 'sector_sector 3', 'sector_sector 3 phase 2', 'sector_sector 3 phase 3 extension', 'sector_sector 30', 'sector_sector 31', 'sector_sector 33', 'sector_sector 36', 'sector_sector 36a', 'sector_sector 37', 'sector_sector 37c', 'sector_sector 37d', 'sector_sector 38', 'sector_sector 39', 'sector_sector 4', 'sector_sector 40', 'sector_sector 41', 'sector_sector 43', 'sector_sector 45', 'sector_sector 46', 'sector_sector 47', 'sector_sector 48', 'sector_sector 49', 'sector_sector 5', 'sector_sector 50', 'sector_sector 51', 'sector_sector 52', 'sector_sector 53', 'sector_sector 54', 'sector_sector 55', 'sector_sector 56', 'sector_sector 57', 'sector_sector 58', 'sector_sector 59', 'sector_sector 6', 'sector_sector 60', 'sector_sector 61', 'sector_sector 62', 'sector_sector 63', 'sector_sector 63a', 'sector_sector 65', 'sector_sector 66', 'sector_sector 67', 'sector_sector 67a', 'sector_sector 68', 'sector_sector 69', 'sector_sector 7', 'sector_sector 70', 'sector_sector 70a', 'sector_sector 71', 'sector_sector 72', 'sector_sector 73', 'sector_sector 74', 'sector_sector 76', 'sector_sector 77', 'sector_sector 78', 'sector_sector 79', 'sector_sector 8', 'sector_sector 80', 'sector_sector 81', 'sector_sector 82', 'sector_sector 82a', 'sector_sector 83', 'sector_sector 84', 'sector_sector 85', 'sector_sector 86', 'sector_sector 88a', 'sector_sector 88b', 'sector_sector 89', 'sector_sector 9', 'sector_sector 90', 'sector_sector 91', 'sector_sector 92', 'sector_sector 93', 'sector_sector 95', 'sector_sector 99', 'sector_sector 99a', 'sector_sector 9a', 'sector_sohna road', 'sector_sohna road road', 'floor_category_High Floor', 'floor_category_Low Floor', 'floor_category_Mid Floor'] ['Unnamed: 0', 'property_type', 'bedRoom', 'bathroom', 'balcony', 'agePossession', 'built_up_area', 'servant room', 'store room', 'furnishing_type', 'luxury_category', 'sector_dwarka expressway', 'floor_category_High Floor']
expected sector_sector 80, sector_sector 74, sector_sector 48, sector_sector 5, sector_sector 23, sector_sector 111, sector_sector 65, sector_sector 53, sector_sector 69, sector_sector 41, sector_sector 60, sector_sector 67, sector_sector 86, sector_sector 7, sector_sector 93, sector_sector 68, sector_sector 108, sector_sector 10a, sector_sector 103, sector_sector 89, sector_sector 46, sector_sector 113, sector_sector 70a, sector_sector 95, sector_manesar, sector_sector 39, sector_sector 59, sector_sector 12, sector_sector 31, sector_sector 73, sector_sector 43, sector_sector 38, sector_sector 27, sector_sector 26, sector_sector 54, sector_sector 63a, sector_sector 67a, floor_category_Mid Floor, sector_sector 58, sector_sector 110, sector_sector 28, sector_sector 6, sector_sector 71, sector_sector 104, sector_gwal pahari, sector_sector 56, sector_sector 88b, sector_sector 112, sector_sector 45, sector_sector 50, sector_sector 109, sector_sector 99, sector_sohna road road, sector_sector 81, sector_sector 78, sector_sector 55, sector_sector 33, sector_sector 37c, sector_sector 47, sector_sector 51, sector_sector 76, sector_sector 83, sector_sector 36a, sector_new sector 2, sector_sector 24, sector_sector 14, sector_sector 37, sector_sector 85, sector_sector 82, floor_category_Low Floor, sector_sector 92, sector_sector 70, sector_sector 91, sector_sector 11, sector_sector 107, sector_sector 8, sector_sector 106, sector_sector 88a, sector_sector 9, sector_sector 84, sector_sector 63, sector_sector 36, sector_sector 77, sector_sector 3 phase 3 extension, sector_sector 102, sector_sector 79, sector_sector 15, sector_sector 62, sector_sector 72, sector_sector 4, sector_sector 99a, sector_sohna road, sector_sector 3 phase 2, sector_sector 66, sector_new, sector_sector 61, sector_sector 21, sector_sector 17b, sector_sector 105, sector_sector 37d, sector_sector 90, sector_sector 17a, sector_sector 52, sector_sector 22, sector_sector 2, sector_sector 40, sector_sector 30, sector_sector 13, sector_sector 17, sector_sector 25, sector_sector 57, sector_sector 82a, sector_sector 1, sector_sector 9a, sector_sector 3, sector_sector 49 in input data
training data did not have the following fields: Unnamed: 0

In [2]:
dtree.fit(X_train,y_train)

In [3]:
dtree.score(X_train,y_train)

0.7738666770634407

In [4]:
dtree.score(X_test,y_test)

0.6227318124731798

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

gbr = GradientBoostingRegressor(random_state=0)

gbr.fit(X_train,y_train)

gbr.score(X_train,y_train)

0.8659198364102894

In [12]:
gbr.score(X_test,y_test)

0.7524092930879209

In [13]:
y_pred_gbr = gbr.predict(X_test)
from sklearn.metrics import mean_absolute_error


0.7216352512893671