In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

****

# 

**First, let's import our main libraries**

****

# 

**Read the CSV file**

In [2]:
# Read the rental listings CSV into `df` and preview the first rows.
df = pd.read_csv(
    '../input/house-rent-prediction-dataset/House_Rent_Dataset.csv'
)
df.head()

In [3]:
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

****

# 

**Check if the data has null values**

In [4]:
# Count the missing values in each column.
df.isnull().sum()

****

# 

**Since all dates are from the year 2022, we will drop the 'Posted On' column**

In [5]:
# Remove the posting-date column from `df` (in place).
df.drop(columns=['Posted On'], inplace=True)

****

**Check the unique values in all the columns**

In [6]:
# Collect the object-dtype column names, leaving out 'Floor'.
categorical_variables = list(df.select_dtypes(include='O').columns)
categorical_variables.remove('Floor')

# Show the distinct values of each remaining categorical column.
for feature in categorical_variables:
    distinct_values = df[feature].unique()
    print(f'{feature} has unique values : {distinct_values} ')

****

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

**Let's check which city has the most houses for rent, and how rent is distributed (min, max, average) in each city**

In [8]:
# Bar chart of the number of listings per city, sorted ascending.
fig, ax = plt.subplots(figsize=(12, 6))
city_counts = df['City'].value_counts().sort_values()
city_counts.plot(kind='bar', color="maroon", rot=0, width=0.8, ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the bars; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Number of houses')
ax.set_title(' Number of houses on rent across cities', fontsize=15)

fig.tight_layout()

****

# 

**Let's look at the boxen plot of rent for each city**

In [9]:
# Boxen plot of rent per city.
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.boxenplot(x='City', y='Rent', data=df, ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the data; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Rent')
ax.set_title('City vs Rent', fontsize=15)
plt.show()

****

# 

**Let's see the average rent in each city**

In [10]:
# Bar chart of the mean rent per city, sorted ascending.
fig, ax = plt.subplots(figsize=(12, 6))
mean_rent_by_city = df.groupby('City')['Rent'].mean().sort_values()
mean_rent_by_city.plot(kind='bar', color="maroon", rot=0, width=0.8, ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the bars; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Rent')
ax.set_title(' Average rent for each city', fontsize=15)


fig.tight_layout()

*Conclusion*
> *Average rent for Mumbai is significantly higher than other cities*

****

# 

**Ok lets check how rent is scattered against size**

In [11]:
# Scatter plot of rent against size; each point is coloured by its size value.
fig, ax = plt.subplots(figsize=(12, 6))
plt.scatter(x=df['Size'], y=df['Rent'], c=df["Size"], cmap='viridis')

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')
ax.tick_params(bottom=False, left=False)

ax.set_xlabel('Size')
# NOTE(review): the label states the unit as lakh, while the other plots use a
# plain 'Rent' label - confirm the unit of the Rent column.
ax.set_ylabel('Rent (lakh)')
ax.set_title('Size of the house vs Rent', fontsize=15)

fig.tight_layout()

In [12]:
# Histogram of the 'Size' column with 20 bins.
size_values = df['Size']
sns.displot(size_values, bins=20, height=6, aspect=2)
plt.title('Distribution of size', fontsize=15)

*Conclusion*
> *The majority of houses/apartments for rent have a size of less than 4000.*
> We see 2 distinct outliers: a house with a size of less than 3000 has a significantly higher rent than the others, and a house of 8000 sq ft is available at a fairly low rent.

****

# 

**Lets see the relationship between average rent and BHK filtered by city**

In [13]:
# Bar chart of the mean rent for every (BHK, City) pair, sorted ascending.
fig, ax = plt.subplots(figsize=(15, 10))
mean_rent_by_bhk_city = df.groupby(['BHK', 'City'])['Rent'].mean().sort_values()
mean_rent_by_bhk_city.plot(kind='bar', color='crimson', width=0.6, ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the bars; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Rent')
ax.set_title('BHK vs Rent (filtered by city)', fontsize=15)
fig.tight_layout()

****

# 

**Similarly lets check average rent vs no of Bathrooms**

In [14]:
# Bar chart of the mean rent per bathroom count, sorted ascending.
fig, ax = plt.subplots(figsize=(12, 6))
mean_rent_by_bathroom = df.groupby('Bathroom')['Rent'].mean().sort_values()
mean_rent_by_bathroom.plot(kind='bar', rot=0, width=0.8, ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the bars; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Rent')
ax.set_title('Number of Bathrooms vs Rent', fontsize=15)
fig.tight_layout()

*The ordering above arises because there is only one 10-bathroom apartment and four 7-bathroom apartments for rent, whereas there are 60 apartments with 5 bathrooms. It is still surprising nonetheless.*

In [15]:
# Display the rows whose 'Bathroom' value equals 10.
has_ten_bathrooms = df['Bathroom'] == 10
df.loc[has_ten_bathrooms]

********

# 

**Now to check how rent varies across various area types**

In [16]:
# Boxen plot of rent per area type.
fig, ax = plt.subplots(figsize=(12, 10))
sns.boxenplot(x='Area Type', y='Rent', data=df, ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the data; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Rent')
ax.set_title('Area Type vs Rent', fontsize=15)
fig.tight_layout()

********

# 

**Now let's check what types of tenants are preferred across different cities**

In [17]:
# Count how often each 'Tenant Preferred' value occurs within each city.
tenant_pref_by_city = df.groupby('City')['Tenant Preferred']
tenant_pref_by_city.value_counts()

****

**The last two important things we want to see are:**


> 1. What role does the point of contact play in the rent?
> 
> 
> 2. Which localities have the most apartments/houses for rent?

***Part1***

In [19]:

# Bar chart of the mean rent per point of contact, sorted ascending.
fig, ax = plt.subplots(figsize=(12, 6))
mean_rent_by_contact = df.groupby('Point of Contact')['Rent'].mean().sort_values()
mean_rent_by_contact.plot(kind='bar', rot=0, width=0.8, color='grey', ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the bars; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_ylabel('Rent')
ax.set_title('Point of contact vs Rent', fontsize=15)
fig.tight_layout()

*Conclusion*
> *As suspected, renting through agents results in much higher average rent.*

****

***Part2***

In [20]:
# Listing counts per locality, most frequent first.
df1 = df['Area Locality'].value_counts().sort_values(ascending=False)
fig, ax = plt.subplots(figsize=(12, 8))

# Plot the 20 most frequent localities, re-sorted ascending for display.
top_localities = df1.head(20).sort_values()
top_localities.plot(kind='bar', width=0.8, color='plum', ax=ax)

# Hide the top/right/left frame lines and soften the bottom one.
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)
ax.spines['bottom'].set_color('#DDDDDD')

ax.tick_params(bottom=False, left=False)

# Light horizontal grid drawn behind the bars; no vertical grid.
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='#EEEEEE')
ax.xaxis.grid(False)
ax.set_title('Top 20 localities with most houses on rent', fontsize=15)

fig.tight_layout()

****

# 

**For simplicity, we will remove the rows for upper and lower basement floors and assign 0 to the ground floor**

In [21]:
# Reduce 'Floor' to its first space-separated token. `assign` builds a new
# frame, so the frame this cell started from is not modified in place.
df = df.assign(Floor=df['Floor'].apply(lambda x: str(x).split(' ')[0]))

# Drop rows whose first token is 'Lower' or 'Upper'. The explicit copy makes
# the result an independent frame, so the column assignment below does not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[~df['Floor'].isin(['Lower', 'Upper'])].copy()

# Map 'Ground' to 0 and convert to numbers; any token that cannot be parsed
# as a number becomes NaN (errors='coerce').
df['Floor'] = pd.to_numeric(df['Floor'].replace('Ground', '0'), errors='coerce')

In [22]:
# Correlate the numeric columns only: with object (string) columns present,
# DataFrame.corr() raises on pandas >= 2.0 instead of silently skipping them.
corr = df.select_dtypes(include='number').corr()

# Boolean mask covering the diagonal and upper triangle, so each pair is drawn
# once. Building it from ones (rather than from the correlation values) keeps
# the mask independent of the data.
matrix = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(12,6))
sns.heatmap(corr, annot=True, mask=matrix)
plt.title('Heatmap of correlation between varibles',fontsize=15)
plt.show()

**Conclusion**
*Rent has a similar, fairly positive correlation with all the variables.*

*There exists some multicollinearity*

**Check for outliers**

In [23]:
# Draw one box plot per non-object column to inspect outliers.
numerical_variables = list(df.select_dtypes(exclude='O').columns)
for features in numerical_variables:
    fig, ax = plt.subplots()
    # Draw on the axes created above instead of relying on the implicit
    # current axes.
    df.boxplot(column=features, ax=ax)
    ax.set_title(features)
    # Adjust the layout before the figure is shown; calling it after
    # plt.show() does not affect the displayed figure.
    fig.tight_layout()
    plt.show()

*We will cap the data and then use a robust scaler to deal with the outliers.*

In [24]:
# Remove the rows where Bathroom == 10. The explicit copy makes `df` an
# independent frame rather than a slice of the previous one, so later column
# assignments on it do not trigger SettingWithCopyWarning.
df = df[df['Bathroom'] != 10].copy()

In [25]:
def cap_data(df):
    """Cap every float64/int64 column of ``df`` at its 1st and 99th percentiles.

    Values at or below the 1st percentile are set to the 1st percentile and
    values at or above the 99th percentile are set to the 99th percentile.
    Columns of any other dtype are left untouched. A line is printed for every
    column visited, whether or not it is capped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to cap. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after capping.
    """
    for col in df.columns:
        print("capping the ",col)
        if df[col].dtype in ('float64', 'int64'):
            lower, upper = df[col].quantile([0.01, 0.99]).values
            # Assign the clipped column back in one step. The previous
            # chained form `df[col][mask] = value` writes through an
            # intermediate Series, which raises SettingWithCopyWarning and
            # does not update `df` when pandas copy-on-write is enabled.
            df[col] = df[col].clip(lower=lower, upper=upper)
    return df

# Cap the numeric columns of `df` and rebind `df` to the returned frame.
df = cap_data(df)

In [26]:
# Remove the locality column from `df` (in place), then one-hot encode what is
# left, dropping the first level of each encoded column.
df.drop(columns='Area Locality', inplace=True)
new_df = pd.get_dummies(df, drop_first=True)
new_df.head()

****

# 

**Split the independent variables to X and dependent to Y**

In [27]:
# Features are every column except 'Rent'; the target is the 'Rent' column.
X = new_df.drop(columns='Rent')
y = new_df['Rent']

****

# 

**Its time to do Train Test Split**

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

****

# 

**Lets scale the data with Robust scaler**

In [29]:
from sklearn.preprocessing import RobustScaler
# Fit the scaler on the training features only, then apply the fitted scaler
# to both the training and the test features.
ro = RobustScaler()
ro.fit(X_train)
X_train = ro.transform(X_train)
X_test = ro.transform(X_test)

****

# 

**Using Random Forest Regressor, as it gave the highest R² score when trying out different models**

In [30]:
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import accuracy_score, mean_squared_error,r2_score
# Seed the forest so the reported metrics are the same on every run; without a
# random_state the scores change each time the cell is executed.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)  # computed once, reused for the RMSE
print(f'r2 score is {r2_score(y_test,y_pred)}')
print(f'MSE score is {mse}')
print(f'RMSE score is {np.sqrt(mse)}')
      

**Our model is not giving high accuracy, so further improvements will be needed. Hyperparameter tuning of the model did not yield significantly better results.**