In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("House_Rent_Dataset.csv")

In [3]:
df


Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2022-05-18,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner
4742,2022-05-15,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner
4743,2022-07-10,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent
4744,2022-07-06,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent


In [4]:
df.nunique()

Posted On              81
BHK                     6
Rent                  243
Size                  615
Floor                 480
Area Type               3
Area Locality        2235
City                    6
Furnishing Status       3
Tenant Preferred        3
Bathroom                8
Point of Contact        3
dtype: int64

In [5]:
df['Posted On'] = pd.to_datetime(df['Posted On'], errors='coerce')

# Extract useful features
df['Posted_Month'] = df['Posted On'].dt.month
df['Posted_Year'] = df['Posted On'].dt.year


In [6]:
df.drop('Posted On', axis=1, inplace=True)


In [7]:
df

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year
0,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,5,2022
1,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022
2,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022
3,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,7,2022
4,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,5,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner,5,2022
4742,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner,5,2022
4743,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent,7,2022
4744,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent,7,2022


In [8]:
def process_floor(floor_str):
    if pd.isnull(floor_str):
        return pd.Series([None, None])
    parts = floor_str.split(" out of ")
    current = parts[0]
    total = parts[1] if len(parts) > 1 else None

    # Handle "Ground" as 0
    curr = current.lower()
    current = (
    -2 if curr == "lower basement"
    else -1 if curr == "upper basement"
    else  0 if curr == "ground"
    else pd.to_numeric(current, errors='coerce'))
    total = pd.to_numeric(total, errors='coerce')

    return pd.Series([current, total])

df[['Current_Floor', 'Total_Floor']] = df['Floor'].apply(process_floor)
df.drop('Floor', axis=1, inplace=True)
# also handle case for "upper basement out of total floor"

In [9]:
df


Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor
0,2,10000,1100,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,5,2022,0.0,2.0
1,2,20000,800,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022,1.0,3.0
2,2,17000,1000,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022,1.0,3.0
3,2,10000,800,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,7,2022,1.0,2.0
4,2,7500,850,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,5,2022,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner,5,2022,3.0,5.0
4742,3,29000,2000,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner,5,2022,1.0,4.0
4743,3,35000,1750,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent,7,2022,3.0,5.0
4744,3,45000,1500,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent,7,2022,23.0,34.0


In [10]:
# Target Encoding
# Example: assume 'Rent' is the target
# area_target_mean = df.groupby('Area Locality')['Rent'].mean()

# # Map each locality to its mean rent
# df['Area_Locality_Encoded'] = df['Area Locality'].map(area_target_mean)

# # Drop original
# df.drop('Area Locality', axis=1, inplace=True)
# This caususe Leakage

In [11]:

from sklearn.preprocessing import LabelEncoder

cols_to_label_encode = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']

le_dict = {}
for col in cols_to_label_encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le  # save encoder if needed later


In [12]:
df

Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor
0,2,10000,1100,2,Bandel,4,2,1,2,2,5,2022,0.0,2.0
1,2,20000,800,2,"Phool Bagan, Kankurgachi",4,1,1,1,2,5,2022,1.0,3.0
2,2,17000,1000,2,Salt Lake City Sector 2,4,1,1,1,2,5,2022,1.0,3.0
3,2,10000,800,2,Dumdum Park,4,2,1,1,2,7,2022,1.0,2.0
4,2,7500,850,1,South Dum Dum,4,2,0,1,2,5,2022,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,1,Bandam Kommu,3,1,1,2,2,5,2022,3.0,5.0
4742,3,29000,2000,2,"Manikonda, Hyderabad",3,1,1,3,2,5,2022,1.0,4.0
4743,3,35000,1750,1,"Himayath Nagar, NH 7",3,1,1,3,0,7,2022,3.0,5.0
4744,3,45000,1500,1,Gachibowli,3,1,2,2,0,7,2022,23.0,34.0


In [13]:
df.nunique()

BHK                     6
Rent                  243
Size                  615
Area Type               3
Area Locality        2235
City                    6
Furnishing Status       3
Tenant Preferred        3
Bathroom                8
Point of Contact        3
Posted_Month            4
Posted_Year             1
Current_Floor          54
Total_Floor            66
dtype: int64

In [14]:
df['Rent_raw'] = df['Rent']        # backup original


In [15]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
# rent_scaler = StandardScaler()
size_scaler = StandardScaler()
# Scale Rent column
# df['Rent'] = rent_scaler.fit_transform(df[['Rent']])

# Scale Size column
df['Size'] = size_scaler.fit_transform(df[['Size']])


In [16]:
df


Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Rent_raw
0,2,10000,0.208960,2,Bandel,4,2,1,2,2,5,2022,0.0,2.0,10000
1,2,20000,-0.264125,2,"Phool Bagan, Kankurgachi",4,1,1,1,2,5,2022,1.0,3.0,20000
2,2,17000,0.051265,2,Salt Lake City Sector 2,4,1,1,1,2,5,2022,1.0,3.0,17000
3,2,10000,-0.264125,2,Dumdum Park,4,2,1,1,2,7,2022,1.0,2.0,10000
4,2,7500,-0.185277,1,South Dum Dum,4,2,0,1,2,5,2022,1.0,2.0,7500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,0.051265,1,Bandam Kommu,3,1,1,2,2,5,2022,3.0,5.0,15000
4742,3,29000,1.628216,2,"Manikonda, Hyderabad",3,1,1,3,2,5,2022,1.0,4.0,29000
4743,3,35000,1.233978,1,"Himayath Nagar, NH 7",3,1,1,3,0,7,2022,3.0,5.0,35000
4744,3,45000,0.839741,1,Gachibowli,3,1,2,2,0,7,2022,23.0,34.0,45000


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 0. (Optional) Clean column names
df.columns = df.columns.str.strip()

# 1. Split your dataset first
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# 2. Compute mean rent per locality from training data only
target_map = train_df.groupby('Area Locality')['Rent_raw'].mean()

# 3. Map it back to train and test
train_df['Area_Locality_TE'] = train_df['Area Locality'].map(target_map)
test_df['Area_Locality_TE']  = test_df['Area Locality'].map(target_map)

# 4. Handle unseen localities in test set (fill with overall train mean)
overall_mean = train_df['Rent_raw'].mean()
test_df['Area_Locality_TE'] = test_df['Area_Locality_TE'].fillna(overall_mean)





In [18]:
def smooth_target_encoding(df, target_col, cat_col, alpha=10):
    global_mean = df[target_col].mean()
    stats = df.groupby(cat_col)[target_col].agg(['mean', 'count'])
    # Bayesian smoothing: weight category mean by count vs. global mean by alpha
    smooth = (stats['mean'] * stats['count'] + global_mean * alpha) / (stats['count'] + alpha)
    return smooth

# Compute a smoothed mapping on train only
smoothed_map = smooth_target_encoding(train_df, 'Rent_raw', 'Area Locality', alpha=10)

train_df['Area_Locality_TE'] = train_df['Area Locality'].map(smoothed_map)
test_df['Area_Locality_TE']  = test_df['Area Locality'].map(smoothed_map).fillna(overall_mean)

# Then drop the original column as before
# train_df = train_df.drop(columns=['Area Locality'])
# test_df  = test_df.drop(columns=['Area Locality'])
train_df.drop(columns=['Rent_raw','Area Locality'], inplace=True)
test_df.drop(columns=['Rent_raw','Area Locality'], inplace=True)


In [19]:
from sklearn.preprocessing import StandardScaler

scaler_te = StandardScaler()

# Fit and transform using .loc to avoid SettingWithCopyWarning
train_df.loc[:, 'Area_Locality_TE'] = scaler_te.fit_transform(train_df[['Area_Locality_TE']])
test_df.loc[:, 'Area_Locality_TE'] = scaler_te.transform(test_df[['Area_Locality_TE']])


In [20]:
df


Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Rent_raw
0,2,10000,0.208960,2,Bandel,4,2,1,2,2,5,2022,0.0,2.0,10000
1,2,20000,-0.264125,2,"Phool Bagan, Kankurgachi",4,1,1,1,2,5,2022,1.0,3.0,20000
2,2,17000,0.051265,2,Salt Lake City Sector 2,4,1,1,1,2,5,2022,1.0,3.0,17000
3,2,10000,-0.264125,2,Dumdum Park,4,2,1,1,2,7,2022,1.0,2.0,10000
4,2,7500,-0.185277,1,South Dum Dum,4,2,0,1,2,5,2022,1.0,2.0,7500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,0.051265,1,Bandam Kommu,3,1,1,2,2,5,2022,3.0,5.0,15000
4742,3,29000,1.628216,2,"Manikonda, Hyderabad",3,1,1,3,2,5,2022,1.0,4.0,29000
4743,3,35000,1.233978,1,"Himayath Nagar, NH 7",3,1,1,3,0,7,2022,3.0,5.0,35000
4744,3,45000,0.839741,1,Gachibowli,3,1,2,2,0,7,2022,23.0,34.0,45000


In [21]:
train_df

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Area_Locality_TE
1995,2,23500,-0.027582,2,0,1,1,2,2,5,2022,6.0,8.0,-0.305073
1497,2,10500,-0.264125,2,0,1,1,2,2,6,2022,3.0,4.0,-0.120669
2763,1,19000,-0.666247,2,2,0,1,1,2,6,2022,2.0,3.0,-0.255441
1351,1,20000,-0.792403,2,5,1,0,1,2,7,2022,3.0,4.0,-0.067241
1862,2,17000,-0.106430,2,0,0,1,2,2,7,2022,4.0,5.0,-0.084113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,3,25000,0.839741,1,3,1,2,3,2,6,2022,1.0,2.0,-0.039122
466,3,20000,0.366655,2,4,2,1,2,2,6,2022,3.0,3.0,-0.517189
3092,2,20000,-0.264125,1,1,1,2,2,0,7,2022,13.0,17.0,-0.238819
3772,3,85000,3.993641,1,1,1,1,3,0,5,2022,0.0,1.0,0.607174


In [22]:
train_df.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
Posted_Month         0
Posted_Year          0
Current_Floor        0
Total_Floor          4
Area_Locality_TE     0
dtype: int64

In [23]:
train_df = train_df.dropna()

In [24]:
train_df.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
Posted_Month         0
Posted_Year          0
Current_Floor        0
Total_Floor          0
Area_Locality_TE     0
dtype: int64

In [25]:
test_df

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Area_Locality_TE
1566,2,16000,0.208960,2,0,2,1,2,2,6,2022,2.0,4.0,0.017970
3159,2,12000,-0.264125,2,1,0,1,2,2,5,2022,2.0,3.0,0.017970
538,2,28000,-0.708824,1,5,1,1,2,2,5,2022,5.0,12.0,-0.009369
2630,3,8000,0.839741,1,2,2,1,3,2,6,2022,1.0,1.0,0.017970
4418,3,46000,1.998799,1,3,2,2,5,0,7,2022,0.0,10.0,0.400428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4124,2,18000,0.442349,1,3,1,1,2,2,4,2022,4.0,5.0,-0.357754
3400,3,25000,0.208960,2,1,2,1,2,2,5,2022,0.0,2.0,-0.112233
1941,2,14000,-0.027582,2,0,2,1,2,2,6,2022,2.0,4.0,-0.847940
3679,3,26000,0.248384,2,1,1,1,2,2,5,2022,2.0,3.0,-0.403765


In [26]:
test_df.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
Posted_Month         0
Posted_Year          0
Current_Floor        0
Total_Floor          0
Area_Locality_TE     0
dtype: int64

In [27]:
train_df.shape

(3792, 14)

In [50]:
import numpy as np
import pandas as pd

from sklearn.model_selection   import train_test_split
from sklearn.preprocessing     import StandardScaler, OneHotEncoder
from sklearn.compose           import ColumnTransformer
from sklearn.pipeline          import Pipeline
from sklearn.ensemble          import RandomForestRegressor
from sklearn.metrics           import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt



# 1) Split out X and y
X = train_df.drop(columns=['Rent'])
y = train_df['Rent'].values

# 2) Train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 3) Auto‑select numeric vs. categorical
numeric_feats = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_feats     = X_train.select_dtypes(include=['object','category']).columns.tolist()

print("Numerical features:", numeric_feats)
print("Categorical features:", cat_feats)

# 4) Build ColumnTransformer
preprocessor = ColumnTransformer([
    ('num', StandardScaler(),             numeric_feats),
    # ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_feats),
])

# 5) Full pipeline
model = Pipeline([
    ('preproc', preprocessor),
    ('reg',     RandomForestRegressor(
                    n_estimators=100,
                    random_state=42,
                    n_jobs=-1
               ))
])

# 6) Train
model.fit(X_train, y_train)

# 7) Predict & evaluate
preds = model.predict(X_val)
mae  = mean_absolute_error   (y_val, preds)
rmse = sqrt(mean_squared_error(y_val, preds))
r2   = r2_score               (y_val, preds)

print(f"RF MAE: ₹{mae:,.0f}, RMSE: ₹{rmse:,.0f}, R²: {r2:.3f}")


Numerical features: ['BHK', 'Size', 'Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Bathroom', 'Point of Contact', 'Posted_Month', 'Posted_Year', 'Current_Floor', 'Total_Floor', 'Area_Locality_TE']
Categorical features: []
RF MAE: ₹7,520, RMSE: ₹27,095, R²: 0.832


In [52]:
import joblib
joblib.dump(model, 'random_forest_pipeline.joblib')


['random_forest_pipeline.joblib']

In [64]:
import pandas as pd
import joblib

# Load the trained model pipeline
model = joblib.load('random_forest_pipeline.joblib')

# Input values (must match feature order and type exactly)
input_data = {
    'BHK': [2],
    'Size': [1100],
    'Area Type': [1],              # e.g., 0: Super built-up, 1: Built-up, etc.
    'City': [2],                   # e.g., 0: Bangalore, 1: Chennai, ...
    'Furnishing Status': [0],     # e.g., 0: Unfurnished, 1: Semi, 2: Furnished
    'Tenant Preferred': [1],      # e.g., 0: Bachelors, 1: Family, etc.
    'Bathroom': [2],
    'Point of Contact': [2],      # e.g., 0: Owner, 1: Agent, etc.
    'Posted_Month': [4],
    'Posted_Year': [2023],
    'Current_Floor': [3],
    'Total_Floor': [5],
    'Area_Locality_TE': [0.728]   # e.g., label-encoded or TF-IDF score
}

# Convert to DataFrame
input_df = pd.DataFrame(input_data)

# Predict
predicted_rent = model.predict(input_df)[0]
print(f"Predicted Rent: ₹{predicted_rent:,.0f}")


Predicted Rent: ₹62,019
