### imports

In [86]:
import pandas as pd
import numpy as np

### params

In [87]:
# File name of the CSV that receives the formatted dataset.
FORMATTED_NAME = "Formatted_Data.csv"

In [88]:
# Source columns carried into the formatted dataset, in output order.
# "Point of Contact" is currently left out of this list, so it is not part of
# the output even though VALUE_MAP still lists its categories.
ATTRIBUTES = [
    "BHK", "Rent", "Size",
    "Area Type", "City", "Furnishing Status", "Tenant Preferred",
    "Bathroom",
]

In [89]:
# Known categories for each categorical attribute. An attribute that appears
# here is expanded into one 0/1 indicator column per listed category, and the
# list order fixes the order of those indicator columns.
VALUE_MAP = {
    "Furnishing Status": [
        "Semi-Furnished",
        "Unfurnished",
        "Furnished",
    ],
    "Area Type": [
        "Super Area",
        "Carpet Area",
        "Built Area",
    ],
    "City": [
        "Mumbai",
        "Chennai",
        "Bangalore",
        "Hyderabad",
        "Delhi",
        "Kolkata",
    ],
    "Tenant Preferred": [
        "Bachelors/Family",
        "Bachelors",
        "Family",
    ],
    "Point of Contact": [
        "Contact Owner",
        "Contact Agent",
        "Contact Builder",
    ],
}

In [90]:
# Columns whose values are replaced by their natural logarithm before export.
LOG_NORMALIZATION = ["Rent", "Size"]


### prepare dataset

In [91]:
# Load the raw dataset from the working directory. low_memory=False makes
# pandas parse the file in one pass instead of in chunks.
df = pd.read_csv(filepath_or_buffer="House_Rent_Dataset.csv", low_memory=False)

In [92]:
# Shuffle all rows (frac=1 samples 100% of them) and renumber the index from 0.
# No random_state is passed, so the resulting row order differs between runs.
df = df.sample(frac=1)
df = df.reset_index(drop=True)

### format values

In [93]:
# Build the encoded rows, one list per input row, following ATTRIBUTES order:
#   * an attribute listed in VALUE_MAP is expanded into one 0/1 indicator per
#     known category, in the order given by VALUE_MAP;
#   * any other attribute is copied through unchanged.
# NOTE: a categorical value that is not listed in VALUE_MAP matches no category
# and is therefore encoded as all zeros rather than raising an error.
#
# itertuples(index=False, name=None) yields plain tuples restricted to the
# ATTRIBUTES columns, which avoids constructing a Series per row as iterrows()
# does and keeps each column's own dtype.
formatted_data = []
for record in df[ATTRIBUTES].itertuples(index=False, name=None):
    formatted_row = []
    for column, raw_value in zip(ATTRIBUTES, record):
        if column in VALUE_MAP:
            formatted_row.extend(
                1 if raw_value == category else 0
                for category in VALUE_MAP[column]
            )
        else:
            formatted_row.append(raw_value)
    formatted_data.append(formatted_row)

In [94]:
# Column headers for the encoded rows, following ATTRIBUTES order: an attribute
# listed in VALUE_MAP contributes one header per category, any other attribute
# contributes its own name.
columns = [
    header
    for attribute in ATTRIBUTES
    for header in (
        VALUE_MAP[attribute] if attribute in VALUE_MAP else [attribute]
    )
]

In [95]:
# Assemble the encoded rows into a DataFrame labelled with the generated headers.
formatted_df = pd.DataFrame(data=formatted_data, columns=columns)

### normalize data

In [96]:
# Replace each column named in LOG_NORMALIZATION with its natural logarithm.
# NOTE(review): np.log yields -inf for 0 and NaN for negative inputs; this
# presumably relies on these columns being strictly positive - confirm.
for feature in LOG_NORMALIZATION:
    raw_series = formatted_df[feature]
    formatted_df[feature] = np.log(raw_series)

In [97]:
# Write the formatted dataset to FORMATTED_NAME as UTF-8 CSV, omitting the
# DataFrame index column.
formatted_df.to_csv(path_or_buf=FORMATTED_NAME, index=False, encoding="utf-8")