In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression  
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR, LinearSVC  
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset1=pd.read_csv("preprocessed_house_rent.csv",index_col=None)
df2=dataset1
df2 = pd.get_dummies(df2, drop_first=True,dtype=int)
indep_X=df2.drop('Total Floors',axis=1)
dep_Y=df2['Total Floors']

In [3]:
df2

Unnamed: 0,BHK,Rent,Size,Area Type,Furnishing Status,Tenant Preferred,Bathroom,City_Chennai,City_Delhi,City_Hyderabad,City_Kolkata,City_Mumbai,Floor Number,Total Floors
0,2,10000,1100,2,2,1,2,0,0,0,1,0,0,2
1,2,20000,800,2,1,1,1,0,0,0,1,0,1,3
2,2,17000,1000,2,1,1,1,0,0,0,1,0,1,3
3,2,10000,800,2,2,1,1,0,0,0,1,0,1,2
4,2,7500,850,1,2,0,1,0,0,0,1,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4741,2,15000,1000,1,1,1,2,0,0,1,0,0,3,5
4742,3,29000,2000,2,1,1,3,0,0,1,0,0,1,4
4743,3,35000,1750,1,1,1,3,0,0,1,0,0,3,5
4744,3,45000,1500,1,1,2,2,0,0,1,0,0,23,34


In [4]:
# Define independent variables (X) and dependent variable (y)
X = df2.drop(columns=['Rent'])  # Independent features
y = df2['Rent']  # Dependent variable (Rent)


In [5]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
#Models to apply RFE with
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Support Vector Machine (SVR)": SVR(kernel='linear'),  # Using linear kernel
    "LinearSVC": LinearSVC()  # Adding LinearSVC as an alternative for feature selection
}

In [7]:
# Dictionary to store the performance metrics
performance_metrics = {
    'Model': [],
    'MAE': [],
    'MSE': [],
    'RMSE': [],
    'R2': []
}


In [8]:
# Evaluate each model
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate the metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is the square root of MSE
    r2 = r2_score(y_test, y_pred)
    # Store the results
    performance_metrics['Model'].append(model_name)
    performance_metrics['MAE'].append(mae)
    performance_metrics['MSE'].append(mse)
    performance_metrics['RMSE'].append(rmse)
    performance_metrics['R2'].append(r2)

#Convert performance metrics to a DataFrame for better visualization
performance_df = pd.DataFrame(performance_metrics)

#Display the performance comparison
print(performance_df)
    

                          Model           MAE           MSE           RMSE  \
0             Linear Regression  21886.559391  1.914475e+09   43754.716266   
1       Decision Tree Regressor  15120.063860  2.480562e+09   49805.244170   
2       Random Forest Regressor  13574.277526  2.235999e+09   47286.345887   
3  Support Vector Machine (SVR)  17665.822901  2.881529e+09   53679.877843   
4                     LinearSVC  26443.795789  2.901641e+10  170342.035205   

         R2  
0  0.519626  
1  0.377585  
2  0.438950  
3  0.276975  
4 -6.280711  
