In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
import warnings

warnings.filterwarnings("ignore")

In [2]:
!kaggle datasets download -d anmolkumar/house-price-prediction-challenge
!mkdir input/house-price-prediction-challenge
!unzip house-price-prediction-challenge.zip -d input/house-price-prediction-challenge
!rm house-price-prediction-challenge.zip

house-price-prediction-challenge.zip: Skipping, found more recently modified local copy (use --force to force download)
mkdir: input/house-price-prediction-challenge: File exists
Archive:  house-price-prediction-challenge.zip
  inflating: input/house-price-prediction-challenge/sample_submission.csv  
  inflating: input/house-price-prediction-challenge/test.csv  
  inflating: input/house-price-prediction-challenge/train.csv  


In [3]:
train = pd.read_csv("input/house-price-prediction-challenge/train.csv")
train.columns = [col.lower() for col in train.columns]

In [4]:
train

Unnamed: 0,posted_by,under_construction,rera,bhk_no.,bhk_or_rk,square_ft,ready_to_move,resale,address,longitude,latitude,target(price_in_lacs)
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960,55.0
1,Dealer,0,0,2,BHK,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,51.0
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,43.0
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500,62.5
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911,60.5
...,...,...,...,...,...,...,...,...,...,...,...,...
29446,Owner,0,0,3,BHK,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277,45.0
29447,Owner,0,0,2,BHK,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721,16.0
29448,Dealer,0,0,2,BHK,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002,27.1
29449,Owner,0,0,2,BHK,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910,67.0


In [5]:
numeric_cols = ["under_construction", "rera", "bhk_no.", "square_ft", "ready_to_move", "resale", "longitude", "latitude"]
categoric_cols = ["posted_by", "bhk_or_rk", ]
input_cols = numeric_cols + categoric_cols
target = "target(price_in_lacs)"

In [6]:
cat_encoder = preprocessing.OrdinalEncoder()
train.loc[:, categoric_cols] = cat_encoder.fit_transform(train.loc[:, categoric_cols])

***
## Model validation through cross validation

In [7]:
def cast_to_lgb_dset(X, y, categorical_cols):
    lgb_dset = lgb.Dataset(
        data = X,
        label = y,
        categorical_feature = categorical_cols,
        free_raw_data = False,
    )
    return lgb_dset

model_params = {
    "objective": "regression",
    "metric":"rmse",
    "boosting": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 31,
    "max_depth": -1,
    "min_data_in_leaf": 20,
    "bagging_fraction": 0.8,
    "feature_fraction": 0.9,
    "seed": 42,
    "deterministic": True,
    "verbose": -1,
}

In [8]:
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)

In [9]:
cv_scores = list()
best_iterations = list()

for train_idx,valid_idx in kf.split(train.loc[:,input_cols]):
    
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)
    
    _model = lgb.train(
    train_set = train_dset,
    valid_sets = [valid_dset, ],
    num_boost_round = 10000,
    early_stopping_rounds = 50,
    params = model_params, 
    verbose_eval=25,
    )
    
    cv_scores.append(_model.best_score["valid_0"]["rmse"])
    best_iterations.append(_model.best_iteration)

Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 418.529
[50]	valid_0's rmse: 345.739
[75]	valid_0's rmse: 331.983
[100]	valid_0's rmse: 324.346
[125]	valid_0's rmse: 319.746
[150]	valid_0's rmse: 310.105
[175]	valid_0's rmse: 303.237
[200]	valid_0's rmse: 298.945
[225]	valid_0's rmse: 295.839
[250]	valid_0's rmse: 291.221
[275]	valid_0's rmse: 289.17
[300]	valid_0's rmse: 286.647
[325]	valid_0's rmse: 283.051
[350]	valid_0's rmse: 281.978
[375]	valid_0's rmse: 280.42
[400]	valid_0's rmse: 280.306
[425]	valid_0's rmse: 280.123
[450]	valid_0's rmse: 278.337
[475]	valid_0's rmse: 277.578
[500]	valid_0's rmse: 276.707
[525]	valid_0's rmse: 274.979
[550]	valid_0's rmse: 273.422
[575]	valid_0's rmse: 272.815
[600]	valid_0's rmse: 272.152
[625]	valid_0's rmse: 271.452
[650]	valid_0's rmse: 271.412
[675]	valid_0's rmse: 271.78
[700]	valid_0's rmse: 272.14
Early stopping, best iteration is:
[653]	valid_0's rmse: 271.323
Training until validation scores don't im

In [10]:
print(f"CV RMSE: {np.mean(cv_scores):0.5f}")
print(f"Best iterations by fold: {best_iterations}")

CV RMSE: 250.98540
Best iterations by fold: [653, 157, 75, 62, 2751]


***
### Nested cross validation

In [11]:
kf_outer = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)

In [12]:
cv_scores = list()

for train_idx,valid_idx in kf_outer.split(train.loc[:,input_cols]):
        
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]
        
    best_iterations = list()
    kf_inner = model_selection.KFold(n_splits=5, shuffle=True, random_state=2)
    
    for train_idx_inner,valid_idx_inner in kf_inner.split(X_train, y_train):
        X_train_inner,y_train_inner = X_train.iloc[train_idx_inner,:], y_train.iloc[train_idx_inner]
        X_valid_inner,y_valid_inner = X_train.iloc[valid_idx_inner,:], y_train.iloc[valid_idx_inner]
        
        train_dset = cast_to_lgb_dset(X_train_inner, y_train_inner, categoric_cols)
        valid_dset = cast_to_lgb_dset(X_valid_inner, y_valid_inner, categoric_cols)

        _model = lgb.train(
            train_set = train_dset,
            valid_sets = [valid_dset, ],
            num_boost_round = 10000,
            early_stopping_rounds = 50,
            params = model_params, 
            verbose_eval=25,
        )
        best_iterations.append(_model.best_iteration)
        
    best_iteration = np.median(best_iterations)
    
    # trains a model over outer split using best iteration obtained from inner loop
    # (without using early stopping to avoid leakage)

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    
    model = lgb.train(
        train_set = train_dset,
        num_boost_round = int(best_iteration),
        params = model_params)
    preds = model.predict(X_valid)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, preds))
    
    cv_scores.append(rmse)

Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 281.993
[50]	valid_0's rmse: 217.892
[75]	valid_0's rmse: 214.553
[100]	valid_0's rmse: 220.666
Early stopping, best iteration is:
[73]	valid_0's rmse: 214.411
Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 269.852
[50]	valid_0's rmse: 207.478
[75]	valid_0's rmse: 204.989
[100]	valid_0's rmse: 202.445
[125]	valid_0's rmse: 204.401
[150]	valid_0's rmse: 205.427
Early stopping, best iteration is:
[114]	valid_0's rmse: 201.962
Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 637.742
[50]	valid_0's rmse: 568.366
[75]	valid_0's rmse: 551.352
[100]	valid_0's rmse: 544.25
[125]	valid_0's rmse: 541.984
[150]	valid_0's rmse: 543.074
[175]	valid_0's rmse: 542.378
Early stopping, best iteration is:
[130]	valid_0's rmse: 541.839
Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 257.213
[50]	valid_0's rmse: 166.683
[75]

In [13]:
print(f"CV RMSE: {np.mean(cv_scores):0.5f}")

CV RMSE: 284.08901


***
### Repeated cross validation

First make a CV split to find a good value for the number of boosting rounds, then validate your model on a different CV split

In [14]:
kf1 = model_selection.KFold(n_splits=5, shuffle=True, random_state=123)

In [15]:
best_iterations = list()

for train_idx,valid_idx in kf1.split(train.loc[:,input_cols]):
    
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    valid_dset = cast_to_lgb_dset(X_valid, y_valid, categoric_cols)
    
    _model = lgb.train(
    train_set = train_dset,
    valid_sets = [valid_dset, ],
    num_boost_round = 10000,
    early_stopping_rounds = 50,
    params = model_params, 
    verbose_eval=25,
    )
    
    best_iterations.append(_model.best_iteration)

Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 440.434
[50]	valid_0's rmse: 360.037
[75]	valid_0's rmse: 342.679
[100]	valid_0's rmse: 335.615
[125]	valid_0's rmse: 330.739
[150]	valid_0's rmse: 320.228
[175]	valid_0's rmse: 315.033
[200]	valid_0's rmse: 313.683
[225]	valid_0's rmse: 309.243
[250]	valid_0's rmse: 310.531
[275]	valid_0's rmse: 308.089
[300]	valid_0's rmse: 307.59
[325]	valid_0's rmse: 306.676
[350]	valid_0's rmse: 306.539
[375]	valid_0's rmse: 306.518
[400]	valid_0's rmse: 306.326
[425]	valid_0's rmse: 306.97
Early stopping, best iteration is:
[387]	valid_0's rmse: 305.944
Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 234.352
[50]	valid_0's rmse: 171.049
[75]	valid_0's rmse: 163.502
[100]	valid_0's rmse: 165.121
[125]	valid_0's rmse: 167.353
Early stopping, best iteration is:
[76]	valid_0's rmse: 163.4
Training until validation scores don't improve for 50 rounds
[25]	valid_0's rmse: 366.373
[50]	vali

In [16]:
best_iteration = np.median(best_iterations)
kf2 = model_selection.KFold(n_splits=5, shuffle=True, random_state=23)

cv_scores = list()

for train_idx,valid_idx in kf2.split(train.loc[:,input_cols]):
    
    X_train,y_train = train.loc[train_idx,input_cols], train.loc[train_idx,target]
    X_valid,y_valid = train.loc[valid_idx,input_cols], train.loc[valid_idx,target]

    train_dset = cast_to_lgb_dset(X_train, y_train, categoric_cols)
    
    model = lgb.train(
        train_set = train_dset,
        num_boost_round = int(best_iteration),
        params = model_params)
    
    preds = model.predict(X_valid)
    rmse = np.sqrt(metrics.mean_squared_error(y_valid, preds))
    cv_scores.append(rmse)  

In [17]:
print(f"CV RMSE: {np.mean(cv_scores):0.5f}")

CV RMSE: 270.39060


***