In [1]:
import pandas as pd
import numpy as np
import math
from matplotlib import pyplot as plt 
%matplotlib inline
import matplotlib
# Default size (width, height) applied to every matplotlib figure created later in the notebook.
matplotlib.rcParams["figure.figsize"]= (20,10)

In [2]:
# Load the raw rental listings; the path is relative to the notebook's working directory.
df = pd.read_csv("train.csv")

In [3]:
# Keep only the five predictor columns plus the target (`rent`).
# The raw header spellings ('bedroom', 'avalable_for') are used as-is here.
selected_columns = [
    'bedroom',
    'bathrooms',
    'area',
    'furnishing',
    'avalable_for',
    'rent',
]
df = df[selected_columns]

In [4]:
# Preview the first rows of the selected columns.
df.head()

Unnamed: 0,bedroom,bathrooms,area,furnishing,avalable_for,rent
0,2,2,1050.0,Unfurnished,All,20000.0
1,2,2,760.0,Unfurnished,All,14000.0
2,3,3,0.0,Semifurnished,All,22999.0
3,1,1,628.0,Furnished,Family Only,13000.0
4,2,2,668.0,Semifurnished,"Family , Bachelors (Men Only)",7500.0


In [5]:
# Normalise the two awkward raw headers in a single rename:
# pluralise 'bedroom' and fix the 'avalable_for' misspelling.
df = df.rename(
    columns={
        "bedroom": "bedrooms",
        "avalable_for": "available_for",
    }
)
# Total number of cells (rows x columns) in the frame.
df.size

65304

In [6]:
# Discard every row that has a missing value in any column,
# then show the per-column null counts as a sanity check.
df = df.dropna()
df.isna().sum()

bedrooms         0
bathrooms        0
area             0
furnishing       0
available_for    0
rent             0
dtype: int64

In [7]:
# An area of 0 is treated as a placeholder: compute the median over the
# non-zero areas only, then substitute it wherever area == 0.
median = df.loc[df['area'] != 0, 'area'].median()
df['area'] = df['area'].replace(0, median)

In [8]:
# Remove rows whose furnishing label is exactly 'Unfurnishe'.
# NOTE(review): presumably a truncated/malformed label in the raw CSV — confirm
# whether these rows should be dropped or mapped to 'Unfurnished' instead.
df = df[df['furnishing'].ne('Unfurnishe')]

In [9]:
# List the distinct raw `available_for` labels present in the data.
df["available_for"].unique()

array(['All', 'Family Only', 'Family , Bachelors (Men Only)',
       'Family , Bachelors (Women Only)', 'Bachelors (Men/Women)',
       'Bachelors (Women Only)', 'Bachelors (Men Only)'], dtype=object)

In [10]:
def clean(x):
    """Collapse a raw ``available_for`` label into a coarse category.

    The markers are checked in order, so a label containing both
    ``'Family '`` and ``'Bachelors'`` is reported as ``'Family'``.

    Parameters
    ----------
    x : str
        Raw label text.

    Returns
    -------
    str
        ``'Family'`` if the label contains ``'Family '`` (note the trailing
        space), else ``'Bachelors'`` if it contains ``'Bachelors'``, else the
        label unchanged.
    """
    # (substring to look for, category to return) — order matters.
    marker_to_category = (
        ('Family ', 'Family'),
        ('Bachelors', 'Bachelors'),
    )
    for marker, category in marker_to_category:
        if marker in x:
            return category
    return x

# Map every raw `available_for` label through clean().
# `df` at this point is the result of a boolean-mask filter, so an in-place
# column assignment can raise pandas' SettingWithCopyWarning; assign() builds
# a fresh frame with the replaced column instead.
df = df.assign(available_for=df['available_for'].apply(clean))

In [11]:
# Re-check the distinct `available_for` labels after consolidation.
df["available_for"].unique()

array(['All', 'Family', 'Bachelors'], dtype=object)

In [12]:
# Inspect the distinct bedroom counts to spot implausible values.
df["bedrooms"].unique()

array([ 2,  3,  1,  4, 22, 10,  5,  7,  6, 20], dtype=int64)

In [13]:
# Keep listings with at most 9 bedrooms; larger counts are treated as outliers.
df = df[df["bedrooms"].le(9)]

In [14]:
# Re-check the distinct bedroom counts after the outlier filter.
df["bedrooms"].unique()

array([2, 3, 1, 4, 5, 7, 6], dtype=int64)

In [15]:
# Inspect the distinct bathroom counts.
df["bathrooms"].unique()

array([2, 3, 1, 4, 5, 6, 7], dtype=int64)

In [16]:
# Display the frame (pandas shows a truncated head/tail view) after the cleaning steps so far.
df

Unnamed: 0,bedrooms,bathrooms,area,furnishing,available_for,rent
0,2,2,1050.0,Unfurnished,All,20000.0
1,2,2,760.0,Unfurnished,All,14000.0
2,3,3,750.0,Semifurnished,All,22999.0
3,1,1,628.0,Furnished,Family,13000.0
4,2,2,668.0,Semifurnished,Family,7500.0
...,...,...,...,...,...,...
10879,2,1,750.0,Unfurnished,Family,15000.0
10880,2,1,750.0,Unfurnished,All,11000.0
10881,3,4,2390.0,Unfurnished,Family,55000.0
10882,2,2,563.0,Semifurnished,Family,14000.0


In [17]:
# Derive rent per unit of area; it is used only as an outlier-screening aid.
# `df` is the result of an earlier boolean-mask filter, so adding a column in
# place can raise pandas' SettingWithCopyWarning; assign() returns a new frame
# with the extra column appended instead.
df = df.assign(price_per_sqft=df['rent'] / df['area'])

In [18]:
# Display the frame including the derived `price_per_sqft` column.
df

Unnamed: 0,bedrooms,bathrooms,area,furnishing,available_for,rent,price_per_sqft
0,2,2,1050.0,Unfurnished,All,20000.0,19.047619
1,2,2,760.0,Unfurnished,All,14000.0,18.421053
2,3,3,750.0,Semifurnished,All,22999.0,30.665333
3,1,1,628.0,Furnished,Family,13000.0,20.700637
4,2,2,668.0,Semifurnished,Family,7500.0,11.227545
...,...,...,...,...,...,...,...
10879,2,1,750.0,Unfurnished,Family,15000.0,20.000000
10880,2,1,750.0,Unfurnished,All,11000.0,14.666667
10881,3,4,2390.0,Unfurnished,Family,55000.0,23.012552
10882,2,2,563.0,Semifurnished,Family,14000.0,24.866785


In [19]:
# (rows, columns) before the area-per-bedroom filter, for comparison afterwards.
df.shape

(10847, 7)

In [20]:
# Drop listings offering less than 300 area units per bedroom.
# The negated "< 300" test is kept (rather than ">= 300") so that rows where
# the ratio is NaN are retained, exactly as before.
area_per_bedroom = df["area"] / df["bedrooms"]
df = df[~area_per_bedroom.lt(300)]

In [21]:
# (rows, columns) after the area-per-bedroom filter.
df.shape

(9855, 7)

In [22]:
# Summary statistics of price per unit area, used to judge outlier cut-offs.
df.price_per_sqft.describe()

count      9855.000000
mean         37.114364
std        1657.968462
min           0.178633
25%          15.333333
50%          20.000000
75%          24.691358
max      164609.052000
Name: price_per_sqft, dtype: float64

In [23]:
# Keep rows whose price per unit area is neither below 10 nor above 70.
# Expressed as a single negated "out of band" mask so NaN values pass through
# unchanged, matching two successive negated filters.
df = df[~((df["price_per_sqft"] < 10) | (df["price_per_sqft"] > 70))]
df["price_per_sqft"].describe()

count    9216.000000
mean       21.265959
std         7.021129
min        10.000000
25%        16.159674
50%        20.000000
75%        25.000000
max        66.666667
Name: price_per_sqft, dtype: float64

In [24]:
# Display the frame after the price-per-area outlier filter.
df

Unnamed: 0,bedrooms,bathrooms,area,furnishing,available_for,rent,price_per_sqft
0,2,2,1050.0,Unfurnished,All,20000.0,19.047619
1,2,2,760.0,Unfurnished,All,14000.0,18.421053
3,1,1,628.0,Furnished,Family,13000.0,20.700637
4,2,2,668.0,Semifurnished,Family,7500.0,11.227545
5,2,2,950.0,Semifurnished,Family,17000.0,17.894737
...,...,...,...,...,...,...,...
10878,2,1,900.0,Unfurnished,Family,15000.0,16.666667
10879,2,1,750.0,Unfurnished,Family,15000.0,20.000000
10880,2,1,750.0,Unfurnished,All,11000.0,14.666667
10881,3,4,2390.0,Unfurnished,Family,55000.0,23.012552


In [25]:
from sklearn.preprocessing import LabelEncoder

# Encode the furnishing labels as integer codes. The fitted encoder is kept in
# `le_furnishing` so the same mapping can be reused at prediction time.
# NOTE(review): integer codes imply an ordering between categories that a
# linear model will treat as numeric — confirm this is acceptable.
le_furnishing = LabelEncoder()
# `df` comes from boolean-mask filters, so an in-place column assignment can
# raise SettingWithCopyWarning; assign() returns a new frame instead.
df = df.assign(furnishing=le_furnishing.fit_transform(df['furnishing']))
df["furnishing"].unique()

array([2, 0, 1])

In [26]:
# Display the frame with `furnishing` now stored as integer codes.
df

Unnamed: 0,bedrooms,bathrooms,area,furnishing,available_for,rent,price_per_sqft
0,2,2,1050.0,2,All,20000.0,19.047619
1,2,2,760.0,2,All,14000.0,18.421053
3,1,1,628.0,0,Family,13000.0,20.700637
4,2,2,668.0,1,Family,7500.0,11.227545
5,2,2,950.0,1,Family,17000.0,17.894737
...,...,...,...,...,...,...,...
10878,2,1,900.0,2,Family,15000.0,16.666667
10879,2,1,750.0,2,Family,15000.0,20.000000
10880,2,1,750.0,2,All,11000.0,14.666667
10881,3,4,2390.0,2,Family,55000.0,23.012552


In [27]:
from sklearn.preprocessing import LabelEncoder

# Encode the consolidated `available_for` labels as integer codes. The fitted
# encoder is kept in `le_available_for` for reuse at prediction time.
le_available_for = LabelEncoder()
# `df` comes from boolean-mask filters, so an in-place column assignment can
# raise SettingWithCopyWarning; assign() returns a new frame instead.
df = df.assign(available_for=le_available_for.fit_transform(df['available_for']))
df["available_for"].unique()

array([0, 2, 1])

In [28]:
# `price_per_sqft` is computed from the target (`rent`), so remove it before modelling.
df = df.drop(columns=['price_per_sqft'])

In [29]:
# Display the final modelling frame: numeric/encoded features plus `rent`.
df

Unnamed: 0,bedrooms,bathrooms,area,furnishing,available_for,rent
0,2,2,1050.0,2,0,20000.0
1,2,2,760.0,2,0,14000.0
3,1,1,628.0,0,2,13000.0
4,2,2,668.0,1,2,7500.0
5,2,2,950.0,1,2,17000.0
...,...,...,...,...,...,...
10878,2,1,900.0,2,2,15000.0
10879,2,1,750.0,2,2,15000.0
10880,2,1,750.0,2,0,11000.0
10881,3,4,2390.0,2,2,55000.0


In [30]:
# Feature matrix: every column except the target.
x = df.drop(columns='rent')
x.head()

Unnamed: 0,bedrooms,bathrooms,area,furnishing,available_for
0,2,2,1050.0,2,0
1,2,2,760.0,2,0
3,1,1,628.0,0,2
4,2,2,668.0,1,2
5,2,2,950.0,1,2


In [31]:
# Target vector: the rent column.
y = df['rent']
y.head()

0    20000.0
1    14000.0
3    13000.0
4     7500.0
5    17000.0
Name: rent, dtype: float64

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and every score computed from
# it) is identical on each Restart & Run All; 0 matches the seed used for
# ShuffleSplit elsewhere in this notebook.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=.2, random_state=0)

In [33]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares baseline on the training split
# (fit() returns the estimator itself, so it can be chained).
lr = LinearRegression().fit(xtrain, ytrain)
# R^2 on the held-out split.
lr.score(xtest, ytest)

0.6625341663807489

In [34]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 80/20 re-splits with a fixed seed, so the folds are repeatable.
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)

# One score per split for a freshly constructed linear model.
cross_val_score(estimator=LinearRegression(), X=x, y=y, cv=cv)

array([0.61915589, 0.63769454, 0.66999617, 0.65633781, 0.63400562])

In [35]:
from sklearn.tree import DecisionTreeRegressor

# Fit a single regression tree on the training split.
# random_state is fixed because the tree's split search involves randomness;
# without a seed the fitted tree and its score can change between runs.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(xtrain,ytrain)
# R^2 on the held-out split.
dt.score(xtest,ytest)


0.48128528202144094

In [36]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 80/20 re-splits with a fixed seed, so the folds are repeatable.
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)

# Seed the estimator as well: repeatable folds alone do not make the scores
# repeatable when the tree itself is fitted with unseeded randomness.
cross_val_score(DecisionTreeRegressor(random_state=0),x,y,cv=cv)

array([0.58831115, 0.56169215, 0.53966146, 0.54045619, 0.52292202])

In [37]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest on the training split. This is the model that is later
# pickled, so random_state is fixed to make the saved artifact and its score
# reproducible (bootstrap sampling and feature selection are otherwise random).
rf = RandomForestRegressor(random_state=0)
rf.fit(xtrain,ytrain)
# R^2 on the held-out split.
rf.score(xtest,ytest)

0.6071563975754691

In [38]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

# Five random 80/20 re-splits with a fixed seed, so the folds are repeatable.
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)

# Seed the forest too; otherwise its bootstrap/feature sampling makes the
# scores vary from run to run even though the folds are fixed.
cross_val_score(RandomForestRegressor(random_state=0),x,y,cv=cv)

array([0.63447591, 0.6097417 , 0.62523461, 0.63803172, 0.59194642])

In [39]:
# Show the feature column order; positional inputs passed to predict() must follow it.
x.columns

Index(['bedrooms', 'bathrooms', 'area', 'furnishing', 'available_for'], dtype='object')

In [40]:
# Spot-check prediction for one listing:
# bedrooms=2, bathrooms=2, area=1000, furnishing code 2, available_for code 0.
# The model was fitted on a DataFrame, so pass a DataFrame carrying the same
# column names: this avoids scikit-learn's "X does not have valid feature
# names" warning and ties each value to its column explicitly.
sample = pd.DataFrame([[2, 2, 1000, 2, 0]], columns=x.columns)
rf.predict(sample)



array([20531.698495])

In [42]:
import pickle

# Bundle the fitted forest with both label encoders so a consumer can encode
# raw category strings the same way before calling the model.
data = dict(
    model=rf,
    le_furnishing=le_furnishing,
    le_available_for=le_available_for,
)
with open('Model.pkl', 'wb') as file:
    pickle.dump(data, file)

In [43]:
# Round-trip check: read the bundle back from disk.
# NOTE: pickle.load can execute arbitrary code — only load files you trust.
with open('Model.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack the forest and the two encoders from the loaded dictionary.
forest_loaded, le_furnishing, le_available_for = (
    data[key] for key in ("model", "le_furnishing", "le_available_for")
)