# Data From: [house-rent-prediction-dataset](https://www.kaggle.com/rkb0023/houserentpredictiondataset)


In [2]:
# Imports.
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Show every column when displaying wide DataFrames (`None` removes the column limit).
# Uses the public `pd.set_option` instead of going through the undocumented `pd.pandas` attribute.
pd.set_option("display.max_columns", None)

In [3]:
# Name of the JSON file written by `dump_data`. It will contain the numerical
# encoding for the `object` dtype columns in the dataset.
JSON_FILE_NAME = "data.json"

In [4]:
# Location of the training CSV, kept in a named constant so it is easy to find and change.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only runs as-is
# on a machine that has the file at exactly this location.
CSV_FILE_PATH = r"C:\Users\kids\PycharmProjects\MachineLearningProjects\HouseRentPricePrediction\housing_train.csv"
df = pd.read_csv(CSV_FILE_PATH)  # Loading the data.

In [5]:
# Preview the first rows of the raw data.
df.head()


Unnamed: 0,id,url,region,region_url,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,image_url,description,lat,long,state
0,7039061606,https://bham.craigslist.org/apa/d/birmingham-h...,birmingham,https://bham.craigslist.org,1195,apartment,1908,3,2.0,1,1,1,0,0,0,laundry on site,street parking,https://images.craigslist.org/00L0L_80pNkyDeG0...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
1,7041970863,https://bham.craigslist.org/apa/d/birmingham-w...,birmingham,https://bham.craigslist.org,1120,apartment,1319,3,2.0,1,1,1,0,0,0,laundry on site,off-street parking,https://images.craigslist.org/00707_uRrY9CsNMC...,Find Your Way to Haven Apartment Homes Come ho...,33.3755,-86.8045,al
2,7041966914,https://bham.craigslist.org/apa/d/birmingham-g...,birmingham,https://bham.craigslist.org,825,apartment,1133,1,1.5,1,1,1,0,0,0,laundry on site,street parking,https://images.craigslist.org/00h0h_b7Bdj1NLBi...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
3,7041966936,https://bham.craigslist.org/apa/d/birmingham-f...,birmingham,https://bham.craigslist.org,800,apartment,927,1,1.0,1,1,1,0,0,0,laundry on site,street parking,https://images.craigslist.org/00808_6ghZ8tSRQs...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
4,7041966888,https://bham.craigslist.org/apa/d/birmingham-2...,birmingham,https://bham.craigslist.org,785,apartment,1047,2,1.0,1,1,1,0,0,0,laundry on site,street parking,https://images.craigslist.org/00y0y_21c0FOvUXm...,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al


The columns `id`, `url`, `region_url` and `image_url` are not useful for predicting the price, so we drop them.

In [6]:
# Columns the notebook treats as useless and removes before the analysis.
USELESS_COLUMNS = ["id", "url", "region_url", "image_url"]
# Rebind `df` instead of dropping in place, and ignore columns that are already missing,
# so that re-running this cell does not raise a KeyError once the columns are gone.
df = df.drop(columns=USELESS_COLUMNS, errors="ignore")  # Dropping useless columns

In [7]:
# Preview the data again after dropping the useless columns.
df.head()


Unnamed: 0,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,description,lat,long,state
0,birmingham,1195,apartment,1908,3,2.0,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
1,birmingham,1120,apartment,1319,3,2.0,1,1,1,0,0,0,laundry on site,off-street parking,Find Your Way to Haven Apartment Homes Come ho...,33.3755,-86.8045,al
2,birmingham,825,apartment,1133,1,1.5,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
3,birmingham,800,apartment,927,1,1.0,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al
4,birmingham,785,apartment,1047,2,1.0,1,1,1,0,0,0,laundry on site,street parking,Apartments In Birmingham AL Welcome to 100 Inv...,33.4226,-86.7065,al


# Exploratory Data Analysis

In [8]:
# List the dtype of every column; the `object` ones are the columns that the
# encoding functions defined below operate on.
df.dtypes


region                      object
price                        int64
type                        object
sqfeet                       int64
beds                         int64
baths                      float64
cats_allowed                 int64
dogs_allowed                 int64
smoking_allowed              int64
wheelchair_access            int64
electric_vehicle_charge      int64
comes_furnished              int64
laundry_options             object
parking_options             object
description                 object
lat                        float64
long                       float64
state                       object
dtype: object

# Preprocessing the data

In [None]:
def dump_data(data: dict) -> None:
    """
    This function dumps `data` into a JSON file (`JSON_FILE_NAME`), replacing its contents.

    `data` is serialized completely before the file is opened, so when serialization
    fails the previous contents of the file are left untouched instead of being
    truncated or half-written.

    :param data: Dictionary to dump into the JSON file. In this notebook it holds the numerical
        encoding of values from the `object` dtype columns in `df`. In the app, users will insert
        new data that is not preprocessed, so the encoding must be saved to encode that new data.
    :raises TypeError: If `data` contains a key or value that `json` cannot serialize.
    :raises ValueError: If `data` contains a circular reference.
    """
    # Serialize first: an error raised here must not clobber an existing file.
    serialized = json.dumps(data)
    with open(JSON_FILE_NAME, "w", encoding="utf-8") as f:
        f.write(serialized)


def get_columns_to_encode(df: pd.DataFrame):
    """
    This function builds a numerical encoding for every `object` dtype column in `df`
    and dumps all of them to the JSON file in a single `dump_data` call.

    The dumped dictionary maps each column name to a `{value: index}` dictionary, where
    the indexes follow the order in which the values are returned by `unique()`.
    Everything is dumped at once because `dump_data` overwrites the file on every call:
    dumping column by column would keep only the encoding of the last column.
    Nothing is dumped when `df` has no `object` dtype column.

    NOTE(review): `encode_data` encodes with `LabelEncoder`, whose class order may differ
    from the `unique()` order used here -- confirm which encoding the app relies on.

    :param df: pd.DataFrame with columns to encode
    """
    encodings = {}
    for column in df.columns:
        if df[column].dtype == "O": # If the column should be encoded.
            column_classes = df[column].unique()
            encodings[column] = {key: index for index, key in enumerate(column_classes)}

    if encodings:  # Do not overwrite an existing file with an empty mapping.
        dump_data(encodings)


def encode_data(df: pd.DataFrame, save_encoders: bool) -> pd.DataFrame:
    """
    Replace every `object` dtype column of `df` with its `LabelEncoder` encoding.

    The columns are overwritten on `df` itself, and that same DataFrame is returned.
    :param df: pd.DataFrame whose `object` dtype columns are encoded.
    :param save_encoders: When true, the classes of each column's encoder are saved to
        `<column>.npy` in the current working directory, so that new data can be
        processed later with the same encoding.
    :return: `df`, with its `object` dtype columns encoded.
    """
    # Pick the columns first; every one of them gets its own, freshly fitted encoder.
    object_columns = [name for name in df.columns if df[name].dtype == "O"]

    for name in object_columns:
        label_encoder = LabelEncoder()
        df[name] = label_encoder.fit_transform(df[name])

        if not save_encoders:
            continue
        np.save(f"{name}.npy", label_encoder.classes_)

    return df