In [111]:
import kagglehub
import pandas as pd
import os

path = kagglehub.dataset_download("iamsouravbanerjee/house-rent-prediction-dataset")

print("Path to dataset files:", path)

print("Files in dataset folder:", os.listdir(path))

csv_path = os.path.join(path, 'House_Rent_Dataset.csv')

df = pd.read_csv(csv_path)
df.head()

Path to dataset files: /root/.cache/kagglehub/datasets/iamsouravbanerjee/house-rent-prediction-dataset/versions/9
Files in dataset folder: ['Dataset Glossary.txt', 'House_Rent_Dataset.csv']


Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner


In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [113]:
df["Tenant Preferred"].unique()

array(['Bachelors/Family', 'Bachelors', 'Family'], dtype=object)

In [114]:
df = df.drop(columns=['Point of Contact'])
df.drop(columns='Area Locality', inplace=True)

In [115]:
df.columns

Index(['Posted On', 'BHK', 'Rent', 'Size', 'Floor', 'Area Type', 'City',
       'Furnishing Status', 'Tenant Preferred', 'Bathroom'],
      dtype='object')

In [116]:
def preprocess_and_encode_rental_data(df):
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.compose import ColumnTransformer

    if 'Posted On' in df.columns:
        df['Posted On'] = pd.to_datetime(df['Posted On'], errors='coerce')
        df['Posted_Month'] = df['Posted On'].dt.month
        df['Posted_Year'] = df['Posted On'].dt.year
        df.drop(columns='Posted On', inplace=True)

    if 'Floor' in df.columns:
        def process_floor(floor_str):
            if pd.isnull(floor_str):
                return np.nan, np.nan
            parts = floor_str.split(' out of ')
            level = parts[0].strip()
            total = parts[1].strip() if len(parts) > 1 else np.nan

            if level == 'Ground':
                level_num = 0
            elif level == 'Upper Basement':
                level_num = -1
            else:
                try:
                    level_num = int(level)
                except:
                    level_num = np.nan

            try:
                total_floors = int(total)
            except:
                total_floors = np.nan

            return level_num, total_floors

        df[['Floor_Level', 'Total_Floors']] = df['Floor'].apply(lambda x: pd.Series(process_floor(x)))
        df.drop(columns='Floor', inplace=True)

    if 'Tenant Preferred' in df.columns:
        df['Tenant_Bachelors'] = df['Tenant Preferred'].apply(lambda x: 1 if 'Bachelors' in x else 0)
        df['Tenant_Family'] = df['Tenant Preferred'].apply(lambda x: 1 if 'Family' in x else 0)
        df.drop(columns='Tenant Preferred', inplace=True)

    def cap_outliers_iqr(df, columns):
        for col in columns:
            if col in df.columns:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
        return df

    columns_to_cap = ['Rent', 'Size', 'Bathroom']
    existing_columns_to_cap = [col for col in columns_to_cap if col in df.columns]
    df = cap_outliers_iqr(df, existing_columns_to_cap)

    categorical_cols = ['Area Type', 'City', 'Furnishing Status']

    missing_cols = [col for col in categorical_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"The following columns are missing from the DataFrame: {missing_cols}. "
                         f"Please ensure they are present before proceeding with OneHotEncoding.")

    preprocessor = ColumnTransformer(
        transformers=[('cat', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), categorical_cols)],
        remainder='passthrough'
    )

    df_encoded = preprocessor.fit_transform(df)

    encoded_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)

    non_categorical_cols = [col for col in df.columns if col not in categorical_cols]

    all_columns = list(encoded_columns) + non_categorical_cols

    df = pd.DataFrame(df_encoded, columns=all_columns)

    df['Size'] = pd.to_numeric(df['Size'], errors='coerce')
    df['Bathroom'] = pd.to_numeric(df['Bathroom'], errors='coerce')
    df['Floor_Level'] = pd.to_numeric(df['Floor_Level'], errors='coerce')
    df['Total_Floors'] = pd.to_numeric(df['Total_Floors'], errors='coerce')
    df['Posted_Month'] = pd.to_numeric(df['Posted_Month'], errors='coerce')
    df['Posted_Year'] = pd.to_numeric(df['Posted_Year'], errors='coerce')
    df['BHK'] = pd.to_numeric(df['BHK'], errors='coerce')
    df['Tenant_Bachelors'] = df['Tenant_Bachelors'].astype(int)
    df['Tenant_Family'] = df['Tenant_Family'].astype(int)

    encoded_columns = [col for col in df.columns if col.startswith('Area Type_') or col.startswith('City_') or col.startswith('Furnishing Status_')]
    df[encoded_columns] = df[encoded_columns].astype(int)
    df['Floor_Level'] = df['Floor_Level'].fillna(df['Floor_Level'].mean())
    df['Total_Floors'] = df['Total_Floors'].fillna(df['Total_Floors'].mean())

    return df, preprocessor

In [117]:
X = df.drop('Rent', axis=1)
y = df['Rent']

X_encoded, preprocessor = preprocess_and_encode_rental_data(X)

In [118]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
print(f'RMSE: {rmse:.2f}')

RMSE: 54089.56


In [119]:
from xgboost import XGBRegressor

X = df.drop('Rent', axis=1)
y = df['Rent']

X_encoded, preprocessor = preprocess_and_encode_rental_data(X)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5
print(f'XGBoost RMSE: {rmse:.2f}')

XGBoost RMSE: 52923.70


In [120]:
importance_dict = xgb_model.get_booster().get_score(importance_type='weight')

importance_df = pd.DataFrame({
    'Feature': list(importance_dict.keys()),
    'Importance': list(importance_dict.values())
})

importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)

print(importance_df)

                             Feature  Importance
0                       Total_Floors       869.0
1                               Size       795.0
2                        Floor_Level       695.0
3                                BHK       342.0
4                       Posted_Month       339.0
5                           Bathroom       236.0
6              Area Type_Carpet Area       232.0
7                        City_Mumbai       184.0
8   Furnishing Status_Semi-Furnished       182.0
9      Furnishing Status_Unfurnished       147.0
10                     Tenant_Family       133.0
11                        City_Delhi       108.0
12                  Tenant_Bachelors        94.0
13                    City_Hyderabad        91.0
14                      City_Chennai        58.0
15                      City_Kolkata        34.0
16              Area Type_Super Area         8.0


In [121]:
from sklearn.model_selection import GridSearchCV

xgb_base = XGBRegressor(objective='reg:squarederror', random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_rmse = -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best CV RMSE:", round(best_rmse, 2))

y_pred_test = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
print("Test RMSE:", round(test_rmse, 2))


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}
Best CV RMSE: 54516.84
Test RMSE: 70044.17


In [122]:
import pickle

with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

with open('xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)
