# House Rent Prediction Dataset

In [7]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Load the dataset (the path is resolved relative to the working directory)
data = pd.read_csv('houseRent.csv')

# Display the first few rows of the dataset to get an overview
print("First 5 rows of the dataset:")
print(data.head())

# Check the information of the dataset (data types, non-null counts).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("\nDataset Info:")
data.info()

# Get the statistical summary of the dataset for numerical columns
print("\nStatistical Summary:")
print(data.describe())

# Check for missing values in the dataset (count of nulls per column)
print("\nMissing Values:")
print(data.isnull().sum())

First 5 rows of the dataset:
   BHK   Rent  Size    Area Type             Area Locality     City  \
0    2  10000  1100   Super Area                    Bandel  Kolkata   
1    2  20000   800   Super Area  Phool Bagan, Kankurgachi  Kolkata   
2    2  17000  1000   Super Area   Salt Lake City Sector 2  Kolkata   
3    2  10000   800   Super Area               Dumdum Park  Kolkata   
4    2   7500   850  Carpet Area             South Dum Dum  Kolkata   

  Furnishing Status  Bathroom Point of Contact  
0       Unfurnished         2    Contact Owner  
1    Semi-Furnished         1    Contact Owner  
2    Semi-Furnished         1    Contact Owner  
3       Unfurnished         1    Contact Owner  
4       Unfurnished         1    Contact Owner  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   BHK                4746 non-nu

### Encoding Categorical Attributes

Since there are no missing values, we can proceed to encode the categorical attributes (i.e., those with `object` data types) so that they can be used in machine learning models. The categorical attributes in our dataset are:

1. **Area Type**: Categorical with values like `"Super Area"`, `"Carpet Area"`, etc.
2. **Area Locality**: Categorical, each representing a specific locality.
3. **City**: Categorical with names of cities.
4. **Furnishing Status**: Categorical with values like `"Furnished"`, `"Unfurnished"`, etc.
5. **Point of Contact**: Categorical, usually indicating who to contact. This column might not be important for prediction and can be dropped if irrelevant.

To encode the categorical features, we will use:

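- **Label Encoding** for attributes with a small number of categories (like `Area Type` and `Furnishing Status`). This converts each category into an integer code. Note that the codes are assigned in sorted (alphabetical) order and therefore imply an arbitrary ranking between categories; this is acceptable for the tree-based models used later, but it can mislead linear models.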

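- **One-Hot Encoding** for attributes with a larger number of unique categories (like `Area Locality` and `City`). This will create binary (0 or 1) columns for each category, which is useful to avoid imposing any ordinal relationship between categories. Because `Area Locality` has a very large number of distinct values, this step makes the feature matrix very wide (over two thousand columns).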

#### Steps:
1. Use `Label Encoding` for:
   - `Area Type`
   - `Furnishing Status`
   
2. Use `One-Hot Encoding` for:
   - `Area Locality`
   - `City`

3. The column `Point of Contact` will be dropped, assuming it does not provide useful information for predicting the rent.


In [8]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Label encoding for 'Area Type' and 'Furnishing Status'.
# A separate encoder is fitted for every column and kept in `label_encoders`,
# so the integer codes of each column can later be mapped back to the original
# category names. Refitting one shared encoder would keep only the mapping of
# the column that was encoded last.
label_encoders = {}
for column in ('Area Type', 'Furnishing Status'):
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# One-hot encoding for 'Area Locality' and 'City'
# (drop_first=True omits the first category of each column)
data = pd.get_dummies(data, columns=['Area Locality', 'City'], drop_first=True)

# Drop 'Point of Contact' as it may not be useful for the model
data = data.drop('Point of Contact', axis=1)

# Display the first few rows after encoding
print("Data after encoding:")
print(data.head())

Data after encoding:
   BHK   Rent  Size  Area Type  Furnishing Status  Bathroom  \
0    2  10000  1100          2                  2         2   
1    2  20000   800          2                  1         1   
2    2  17000  1000          2                  1         1   
3    2  10000   800          2                  2         1   
4    2   7500   850          1                  2         1   

   Area Locality_ in Boduppal, NH 2 2  Area Locality_ in Erragadda, NH 9  \
0                               False                              False   
1                               False                              False   
2                               False                              False   
3                               False                              False   
4                               False                              False   

   Area Locality_ in Miyapur, NH 9  Area Locality_117 Residency, Chembur East  \
0                            False                            

In [11]:
# Convert only the boolean (one-hot) columns to 0 and 1.
# Casting the whole frame with astype(int) would also touch the numeric
# columns and would silently truncate any fractional values present there.
bool_columns = data.select_dtypes(include='bool').columns
data = data.astype({column: int for column in bool_columns})

# Display the first few rows to confirm the change
print("Data after converting boolean values to 0s and 1s:")
print(data.head())


Data after converting boolean values to 0s and 1s:
   BHK   Rent  Size  Area Type  Furnishing Status  Bathroom  \
0    2  10000  1100          2                  2         2   
1    2  20000   800          2                  1         1   
2    2  17000  1000          2                  1         1   
3    2  10000   800          2                  2         1   
4    2   7500   850          1                  2         1   

   Area Locality_ in Boduppal, NH 2 2  Area Locality_ in Erragadda, NH 9  \
0                                   0                                  0   
1                                   0                                  0   
2                                   0                                  0   
3                                   0                                  0   
4                                   0                                  0   

   Area Locality_ in Miyapur, NH 9  Area Locality_117 Residency, Chembur East  \
0                               

In this section, we will use **Ensemble Methods** to predict house rent. Ensemble methods combine multiple models to improve the performance of the predictive model. There are two primary types of ensemble methods we will be using:

### 1. Bagging
Bagging (Bootstrap Aggregating) involves training multiple versions of a model on different subsets of the training data (by sampling with replacement) and averaging their predictions. This reduces variance and helps limit overfitting. We will use **Random Forest** for Bagging.

### 2. Boosting
Boosting is an iterative method that trains a sequence of models, with each new model focusing on correcting the errors made by the previous ones. AdaBoost does this by increasing the weights of the observations that were predicted poorly, while gradient boosting (as in XGBoost) fits each new model to the errors that remain. We will use **AdaBoost** and **XGBoost** for Boosting.

## Model Implementations:

- **RandomForestRegressor**: A bagging technique that combines multiple decision trees to reduce variance.
- **AdaBoostRegressor**: A boosting technique that combines weak learners (e.g., decision trees) by giving more weight to hard-to-predict instances.
- **XGBRegressor**: A boosting algorithm that builds strong predictive models by sequentially correcting errors from weak models, with additional regularization to reduce overfitting.

## Steps Involved:

1. **Import Libraries**:
    - We import the necessary libraries for implementing the models and evaluation metrics.

2. **Initialize Models**:
    - **RandomForestRegressor**: A bagging model to reduce variance.
    - **AdaBoostRegressor**: A boosting model to iteratively improve predictions.
    - **XGBRegressor**: A scalable and efficient gradient boosting model.

3. **Evaluate Models**:
    - We define a function `evaluate_model` to train and test each model, compute predictions, and evaluate the model's performance using metrics like **MSE**, **RMSE**, and **R² Score**.

4. **Training and Testing**:
    - For each model, we train the model on the training set (`X_train`, `y_train`) and make predictions on the test set (`X_test`).
    - We then evaluate the model using **Mean Squared Error (MSE)**, **Root Mean Squared Error (RMSE)**, and **R² Score**.


In [12]:
# Import necessary libraries for train-test split and model training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Separate the target column ('Rent') from the predictor columns
y = data['Rent']
X = data.drop(columns='Rent')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the size of each side of the split
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

# Baseline model: a plain linear regression fitted on the training split
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Predict the held-out rows and score the predictions
y_pred = linear_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # same units as 'Rent'
r2 = r2_score(y_test, y_pred)

# Print every metric rounded to two decimals
print("\nModel Performance:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared (R² Score): {r2:.2f}")


Training set shape (X_train, y_train): (3796, 2244) (3796,)
Testing set shape (X_test, y_test): (950, 2244) (950,)

Model Performance:
Mean Squared Error (MSE): 134395396930495234179072.00
Root Mean Squared Error (RMSE): 366599777591.99
R-squared (R² Score): -33722092375797.36


In [15]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Candidate ensemble regressors; each uses 100 estimators and the same seed
random_forest_model = RandomForestRegressor(random_state=42, n_estimators=100)  # bagging
adaboost_model = AdaBoostRegressor(random_state=42, n_estimators=100)           # boosting (AdaBoost)
xgboost_model = XGBRegressor(random_state=42, n_estimators=100)                 # boosting (XGBoost)

# Display name -> estimator, in the order the models are evaluated below
models = dict(
    zip(
        ("Random Forest", "AdaBoost", "XGBoost"),
        (random_forest_model, adaboost_model, xgboost_model),
    )
)

# Function to train, predict, and evaluate a model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and report its test-set metrics.

    The estimator is fitted in place, used to predict ``X_test``, and the
    resulting MSE, RMSE and R² are printed under the estimator's class name.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Features used for fitting.
        y_train: Target values used for fitting.
        X_test: Features of the held-out rows.
        y_test: True target values of the held-out rows.

    Returns:
        tuple: ``(mse, rmse, r2)`` computed on the held-out rows.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # RMSE is derived from the MSE, so the two metrics always agree
    test_mse = mean_squared_error(y_test, predictions)
    test_rmse = np.sqrt(test_mse)
    test_r2 = r2_score(y_test, predictions)

    estimator_name = model.__class__.__name__
    print(f"\nModel: {estimator_name}")
    print(f"Mean Squared Error (MSE): {test_mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {test_rmse:.2f}")
    print(f"R-squared (R² Score): {test_r2:.2f}")

    return test_mse, test_rmse, test_r2

# Fit and score every candidate, following the insertion order of `models`
for model_name in models:
    model = models[model_name]
    print(f"\nTraining and Evaluating {model_name} Model:")
    evaluate_model(model, X_train, y_train, X_test, y_test)



Training and Evaluating Random Forest Model:

Model: RandomForestRegressor
Mean Squared Error (MSE): 1430543179.06
Root Mean Squared Error (RMSE): 37822.52
R-squared (R² Score): 0.64

Training and Evaluating AdaBoost Model:

Model: AdaBoostRegressor
Mean Squared Error (MSE): 2567255419.10
Root Mean Squared Error (RMSE): 50668.09
R-squared (R² Score): 0.36

Training and Evaluating XGBoost Model:

Model: XGBRegressor
Mean Squared Error (MSE): 986108210.23
Root Mean Squared Error (RMSE): 31402.36
R-squared (R² Score): 0.75


In [18]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Same three candidates as in the previous run; only AdaBoost's settings differ
models = {
    # Bagging
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    # Boosting with AdaBoost: more estimators, each contributing a smaller step
    "AdaBoost": AdaBoostRegressor(
        n_estimators=300,
        learning_rate=0.05,
        random_state=42,
    ),
    # Boosting with XGBoost
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
}

# Keep the individual estimators reachable under their own names as well
random_forest_model = models["Random Forest"]
adaboost_model = models["AdaBoost"]
xgboost_model = models["XGBoost"]

# Function to train, predict, and evaluate a model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Train ``model``, predict the test split and print/return its metrics.

    NOTE: this redefines the ``evaluate_model`` helper from the earlier cell
    with the same logic; the later definition is the one in effect afterwards.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Features used for fitting.
        y_train: Target values used for fitting.
        X_test: Features of the held-out rows.
        y_test: True target values of the held-out rows.

    Returns:
        tuple: ``(mse, rmse, r2)`` computed on the held-out rows.
    """
    # Train, then predict the held-out rows
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)

    # Error metrics on the held-out rows
    mse = mean_squared_error(y_test, y_hat)
    rmse, r2 = np.sqrt(mse), r2_score(y_test, y_hat)

    # One report line per metric, headed by the estimator's class name
    print(
        f"\nModel: {model.__class__.__name__}",
        f"Mean Squared Error (MSE): {mse:.2f}",
        f"Root Mean Squared Error (RMSE): {rmse:.2f}",
        f"R-squared (R² Score): {r2:.2f}",
        sep="\n",
    )

    return mse, rmse, r2

# Run the shared evaluation routine over each (name, estimator) pair
for entry in models.items():
    model_name, model = entry
    print(f"\nTraining and Evaluating {model_name} Model:")
    evaluate_model(model, X_train, y_train, X_test, y_test)



Training and Evaluating Random Forest Model:

Model: RandomForestRegressor
Mean Squared Error (MSE): 1430543179.06
Root Mean Squared Error (RMSE): 37822.52
R-squared (R² Score): 0.64

Training and Evaluating AdaBoost Model:

Model: AdaBoostRegressor
Mean Squared Error (MSE): 1903468616.08
Root Mean Squared Error (RMSE): 43628.76
R-squared (R² Score): 0.52

Training and Evaluating XGBoost Model:

Model: XGBRegressor
Mean Squared Error (MSE): 986108210.23
Root Mean Squared Error (RMSE): 31402.36
R-squared (R² Score): 0.75
