In [2]:
import numpy as np
import pandas as pd

In [3]:
df = pd.read_csv("House_Rent_Dataset.csv")

In [4]:
df


Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,2022-05-18,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,2022-05-13,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,2022-05-16,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,2022-07-04,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner
4,2022-05-09,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner
...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2022-05-18,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner
4742,2022-05-15,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner
4743,2022-07-10,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent
4744,2022-07-06,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent


In [5]:
df.nunique()

Posted On              81
BHK                     6
Rent                  243
Size                  615
Floor                 480
Area Type               3
Area Locality        2235
City                    6
Furnishing Status       3
Tenant Preferred        3
Bathroom                8
Point of Contact        3
dtype: int64

In [6]:
df['Posted On'] = pd.to_datetime(df['Posted On'], errors='coerce')

# Extract useful features
df['Posted_Month'] = df['Posted On'].dt.month
df['Posted_Year'] = df['Posted On'].dt.year


In [7]:
df.drop('Posted On', axis=1, inplace=True)


In [8]:
df

Unnamed: 0,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year
0,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,5,2022
1,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022
2,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022
3,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,7,2022
4,2,7500,850,1 out of 2,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,5,2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner,5,2022
4742,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner,5,2022
4743,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent,7,2022
4744,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent,7,2022


In [9]:
def process_floor(floor_str):
    if pd.isnull(floor_str):
        return pd.Series([None, None])
    parts = floor_str.split(" out of ")
    current = parts[0]
    total = parts[1] if len(parts) > 1 else None

    # Handle "Ground" as 0
    curr = current.lower()
    current = (
    -2 if curr == "lower basement"
    else -1 if curr == "upper basement"
    else  0 if curr == "ground"
    else pd.to_numeric(current, errors='coerce'))
    total = pd.to_numeric(total, errors='coerce')

    return pd.Series([current, total])

df[['Current_Floor', 'Total_Floor']] = df['Floor'].apply(process_floor)
df.drop('Floor', axis=1, inplace=True)
# also handle case for "upper basement out of total floor"

In [10]:
df


Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor
0,2,10000,1100,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,5,2022,0.0,2.0
1,2,20000,800,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022,1.0,3.0
2,2,17000,1000,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,5,2022,1.0,3.0
3,2,10000,800,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner,7,2022,1.0,2.0
4,2,7500,850,Carpet Area,South Dum Dum,Kolkata,Unfurnished,Bachelors,1,Contact Owner,5,2022,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner,5,2022,3.0,5.0
4742,3,29000,2000,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner,5,2022,1.0,4.0
4743,3,35000,1750,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent,7,2022,3.0,5.0
4744,3,45000,1500,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent,7,2022,23.0,34.0


In [11]:
# Target Encoding
# Example: assume 'Rent' is the target
# area_target_mean = df.groupby('Area Locality')['Rent'].mean()

# # Map each locality to its mean rent
# df['Area_Locality_Encoded'] = df['Area Locality'].map(area_target_mean)

# # Drop original
# df.drop('Area Locality', axis=1, inplace=True)
# This caususe Leakage

In [12]:

from sklearn.preprocessing import LabelEncoder

cols_to_label_encode = ['Area Type', 'City', 'Furnishing Status', 'Tenant Preferred', 'Point of Contact']

le_dict = {}
for col in cols_to_label_encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le  # save encoder if needed later


In [13]:
df

Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor
0,2,10000,1100,2,Bandel,4,2,1,2,2,5,2022,0.0,2.0
1,2,20000,800,2,"Phool Bagan, Kankurgachi",4,1,1,1,2,5,2022,1.0,3.0
2,2,17000,1000,2,Salt Lake City Sector 2,4,1,1,1,2,5,2022,1.0,3.0
3,2,10000,800,2,Dumdum Park,4,2,1,1,2,7,2022,1.0,2.0
4,2,7500,850,1,South Dum Dum,4,2,0,1,2,5,2022,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,1,Bandam Kommu,3,1,1,2,2,5,2022,3.0,5.0
4742,3,29000,2000,2,"Manikonda, Hyderabad",3,1,1,3,2,5,2022,1.0,4.0
4743,3,35000,1750,1,"Himayath Nagar, NH 7",3,1,1,3,0,7,2022,3.0,5.0
4744,3,45000,1500,1,Gachibowli,3,1,2,2,0,7,2022,23.0,34.0


In [14]:
df.nunique()

BHK                     6
Rent                  243
Size                  615
Area Type               3
Area Locality        2235
City                    6
Furnishing Status       3
Tenant Preferred        3
Bathroom                8
Point of Contact        3
Posted_Month            4
Posted_Year             1
Current_Floor          54
Total_Floor            66
dtype: int64

In [15]:
df['Rent_raw'] = df['Rent']        # backup original


In [16]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
rent_scaler = StandardScaler()
size_scaler = StandardScaler()
# Scale Rent column
df['Rent'] = rent_scaler.fit_transform(df[['Rent']])

# Scale Size column
df['Size'] = size_scaler.fit_transform(df[['Size']])


In [17]:
df


Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Rent_raw
0,2,-0.320026,0.208960,2,Bandel,4,2,1,2,2,5,2022,0.0,2.0,10000
1,2,-0.191982,-0.264125,2,"Phool Bagan, Kankurgachi",4,1,1,1,2,5,2022,1.0,3.0,20000
2,2,-0.230395,0.051265,2,Salt Lake City Sector 2,4,1,1,1,2,5,2022,1.0,3.0,17000
3,2,-0.320026,-0.264125,2,Dumdum Park,4,2,1,1,2,7,2022,1.0,2.0,10000
4,2,-0.352037,-0.185277,1,South Dum Dum,4,2,0,1,2,5,2022,1.0,2.0,7500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,-0.256004,0.051265,1,Bandam Kommu,3,1,1,2,2,5,2022,3.0,5.0,15000
4742,3,-0.076743,1.628216,2,"Manikonda, Hyderabad",3,1,1,3,2,5,2022,1.0,4.0,29000
4743,3,0.000084,1.233978,1,"Himayath Nagar, NH 7",3,1,1,3,0,7,2022,3.0,5.0,35000
4744,3,0.128128,0.839741,1,Gachibowli,3,1,2,2,0,7,2022,23.0,34.0,45000


In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 0. (Optional) Clean column names
df.columns = df.columns.str.strip()

# 1. Split your dataset first
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# 2. Compute mean rent per locality from training data only
target_map = train_df.groupby('Area Locality')['Rent_raw'].mean()

# 3. Map it back to train and test
train_df['Area_Locality_TE'] = train_df['Area Locality'].map(target_map)
test_df['Area_Locality_TE']  = test_df['Area Locality'].map(target_map)

# 4. Handle unseen localities in test set (fill with overall train mean)
overall_mean = train_df['Rent_raw'].mean()
test_df['Area_Locality_TE'] = test_df['Area_Locality_TE'].fillna(overall_mean)





In [19]:
def smooth_target_encoding(df, target_col, cat_col, alpha=10):
    global_mean = df[target_col].mean()
    stats = df.groupby(cat_col)[target_col].agg(['mean', 'count'])
    # Bayesian smoothing: weight category mean by count vs. global mean by alpha
    smooth = (stats['mean'] * stats['count'] + global_mean * alpha) / (stats['count'] + alpha)
    return smooth

# Compute a smoothed mapping on train only
smoothed_map = smooth_target_encoding(train_df, 'Rent_raw', 'Area Locality', alpha=10)

train_df['Area_Locality_TE'] = train_df['Area Locality'].map(smoothed_map)
test_df['Area_Locality_TE']  = test_df['Area Locality'].map(smoothed_map).fillna(overall_mean)

# Then drop the original column as before
# train_df = train_df.drop(columns=['Area Locality'])
# test_df  = test_df.drop(columns=['Area Locality'])
train_df.drop(columns=['Rent_raw','Area Locality'], inplace=True)
test_df.drop(columns=['Rent_raw','Area Locality'], inplace=True)


In [20]:
from sklearn.preprocessing import StandardScaler

scaler_te = StandardScaler()

# Fit and transform using .loc to avoid SettingWithCopyWarning
train_df.loc[:, 'Area_Locality_TE'] = scaler_te.fit_transform(train_df[['Area_Locality_TE']])
test_df.loc[:, 'Area_Locality_TE'] = scaler_te.transform(test_df[['Area_Locality_TE']])


In [21]:
df


Unnamed: 0,BHK,Rent,Size,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Rent_raw
0,2,-0.320026,0.208960,2,Bandel,4,2,1,2,2,5,2022,0.0,2.0,10000
1,2,-0.191982,-0.264125,2,"Phool Bagan, Kankurgachi",4,1,1,1,2,5,2022,1.0,3.0,20000
2,2,-0.230395,0.051265,2,Salt Lake City Sector 2,4,1,1,1,2,5,2022,1.0,3.0,17000
3,2,-0.320026,-0.264125,2,Dumdum Park,4,2,1,1,2,7,2022,1.0,2.0,10000
4,2,-0.352037,-0.185277,1,South Dum Dum,4,2,0,1,2,5,2022,1.0,2.0,7500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,-0.256004,0.051265,1,Bandam Kommu,3,1,1,2,2,5,2022,3.0,5.0,15000
4742,3,-0.076743,1.628216,2,"Manikonda, Hyderabad",3,1,1,3,2,5,2022,1.0,4.0,29000
4743,3,0.000084,1.233978,1,"Himayath Nagar, NH 7",3,1,1,3,0,7,2022,3.0,5.0,35000
4744,3,0.128128,0.839741,1,Gachibowli,3,1,2,2,0,7,2022,23.0,34.0,45000


In [22]:
train_df

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Area_Locality_TE
1995,2,-0.147167,-0.027582,2,0,1,1,2,2,5,2022,6.0,8.0,-0.305073
1497,2,-0.313624,-0.264125,2,0,1,1,2,2,6,2022,3.0,4.0,-0.120669
2763,1,-0.204786,-0.666247,2,2,0,1,1,2,6,2022,2.0,3.0,-0.255441
1351,1,-0.191982,-0.792403,2,5,1,0,1,2,7,2022,3.0,4.0,-0.067241
1862,2,-0.230395,-0.106430,2,0,0,1,2,2,7,2022,4.0,5.0,-0.084113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4426,3,-0.127960,0.839741,1,3,1,2,3,2,6,2022,1.0,2.0,-0.039122
466,3,-0.191982,0.366655,2,4,2,1,2,2,6,2022,3.0,3.0,-0.517189
3092,2,-0.191982,-0.264125,1,1,1,2,2,0,7,2022,13.0,17.0,-0.238819
3772,3,0.640304,3.993641,1,1,1,1,3,0,5,2022,0.0,1.0,0.607174


In [23]:
train_df.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
Posted_Month         0
Posted_Year          0
Current_Floor        0
Total_Floor          4
Area_Locality_TE     0
dtype: int64

In [24]:
train_df = train_df.dropna()

In [25]:
train_df.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
Posted_Month         0
Posted_Year          0
Current_Floor        0
Total_Floor          0
Area_Locality_TE     0
dtype: int64

In [26]:
test_df

Unnamed: 0,BHK,Rent,Size,Area Type,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,Posted_Month,Posted_Year,Current_Floor,Total_Floor,Area_Locality_TE
1566,2,-0.243200,0.208960,2,0,2,1,2,2,6,2022,2.0,4.0,0.017970
3159,2,-0.294417,-0.264125,2,1,0,1,2,2,5,2022,2.0,3.0,0.017970
538,2,-0.089547,-0.708824,1,5,1,1,2,2,5,2022,5.0,12.0,-0.009369
2630,3,-0.345635,0.839741,1,2,2,1,3,2,6,2022,1.0,1.0,0.017970
4418,3,0.140932,1.998799,1,3,2,2,5,0,7,2022,0.0,10.0,0.400428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4124,2,-0.217591,0.442349,1,3,1,1,2,2,4,2022,4.0,5.0,-0.357754
3400,3,-0.127960,0.208960,2,1,2,1,2,2,5,2022,0.0,2.0,-0.112233
1941,2,-0.268808,-0.027582,2,0,2,1,2,2,6,2022,2.0,4.0,-0.847940
3679,3,-0.115156,0.248384,2,1,1,1,2,2,5,2022,2.0,3.0,-0.403765


In [27]:
test_df.isnull().sum()


BHK                  0
Rent                 0
Size                 0
Area Type            0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
Posted_Month         0
Posted_Year          0
Current_Floor        0
Total_Floor          0
Area_Locality_TE     0
dtype: int64

In [28]:
train_df.shape

(3792, 14)

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np

# Assuming you already have train_df loaded
X = train_df.drop(columns=['Rent'])
y = train_df['Rent'].values


In [30]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [31]:
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

X_val_tensor = torch.tensor(X_val.to_numpy(), dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)


In [231]:
import torch.nn as nn

class RentPredictor(nn.Module):
    def __init__(self, input_dim):
        super(RentPredictor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        return self.model(x)

# Initialize the model
input_dim = X_train_tensor.shape[1]
model = RentPredictor(input_dim)


In [233]:
import torch.optim as optim

criterion = nn.MSELoss()  # You can also try MAE later
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [235]:
epochs = 1000

for epoch in range(epochs):
    model.train()
    
    # Forward pass
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    
    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # Validation
    if epoch % 10 == 0:
        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_tensor)
            val_loss = criterion(val_preds, y_val_tensor)
        print(f"Epoch {epoch:3d} | Train Loss: {loss.item():.4f} | Val Loss: {val_loss.item():.4f}")


Epoch   0 | Train Loss: 1268.0234 | Val Loss: 131.8172
Epoch  10 | Train Loss: 474.9239 | Val Loss: 22.1186
Epoch  20 | Train Loss: 239.1296 | Val Loss: 5.7446
Epoch  30 | Train Loss: 155.1377 | Val Loss: 0.7373
Epoch  40 | Train Loss: 93.8348 | Val Loss: 1.1621
Epoch  50 | Train Loss: 62.2409 | Val Loss: 4.8615
Epoch  60 | Train Loss: 44.2145 | Val Loss: 4.6398
Epoch  70 | Train Loss: 32.8464 | Val Loss: 0.7179
Epoch  80 | Train Loss: 25.4522 | Val Loss: 1.3821
Epoch  90 | Train Loss: 19.2289 | Val Loss: 1.6198
Epoch 100 | Train Loss: 15.9392 | Val Loss: 1.2213
Epoch 110 | Train Loss: 13.2205 | Val Loss: 1.0258
Epoch 120 | Train Loss: 11.2090 | Val Loss: 1.1947
Epoch 130 | Train Loss: 9.1086 | Val Loss: 0.9415
Epoch 140 | Train Loss: 8.5141 | Val Loss: 1.0097
Epoch 150 | Train Loss: 7.4775 | Val Loss: 0.8193
Epoch 160 | Train Loss: 6.6836 | Val Loss: 0.8794
Epoch 170 | Train Loss: 6.3092 | Val Loss: 0.8762
Epoch 180 | Train Loss: 5.8371 | Val Loss: 0.8776
Epoch 190 | Train Loss: 4.993

In [236]:
model.eval()
with torch.no_grad():
    final_preds = model(X_val_tensor).squeeze().numpy()


In [237]:
# Denormalize predicted rent
predicted_rent_actual = rent_scaler.inverse_transform(final_preds.reshape(-1, 1))

# (Optional) Also denormalize true values for comparison
true_rent_actual = rent_scaler.inverse_transform(y_val.reshape(-1, 1))


In [238]:
predicted_rent_actual[20:70]

array([[15999.65   ],
       [ 5925.9766 ],
       [10004.662  ],
       [13796.154  ],
       [33020.77   ],
       [15471.885  ],
       [15238.314  ],
       [28730.186  ],
       [21501.283  ],
       [ 5730.912  ],
       [ 7391.8223 ],
       [25824.19   ],
       [16365.051  ],
       [32247.217  ],
       [ 3063.4788 ],
       [ -576.2205 ],
       [18028.791  ],
       [ 7393.3125 ],
       [ 6575.5195 ],
       [ 2560.885  ],
       [ 8288.564  ],
       [16388.363  ],
       [ 5232.342  ],
       [45269.637  ],
       [ 2391.5178 ],
       [  896.25214],
       [15923.756  ],
       [39236.887  ],
       [11416.809  ],
       [16409.145  ],
       [ 8871.     ],
       [11902.643  ],
       [ 4004.76   ],
       [ 4759.7656 ],
       [ 3339.6506 ],
       [ 1096.1583 ],
       [12426.76   ],
       [64982.32   ],
       [ 3042.922  ],
       [ 6080.299  ],
       [-3276.576  ],
       [16695.818  ],
       [ 4404.121  ],
       [60375.867  ],
       [ 6879.7695 ],
       [59

In [239]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_val, final_preds)
mae = mean_absolute_error(y_val, final_preds)
print(f"Final MSE: {mse:.2f}, MAE: {mae:.2f}")


Final MSE: 0.60, MAE: 0.24


In [39]:
#Denormalized
# After prediction (denormalization)
# predicted_rent = scaler.inverse_transform(predicted_rent_normalized.reshape(-1, 1))
