In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer

In [15]:
# Load the raw listing extract; the path is resolved against the notebook's
# working directory.
raw_csv_path = "extra_data.csv"
data = pd.read_csv(raw_csv_path)

## Column description
- `MlsNumber` - property id
- `LivingArea` - living area, recorded in various units (needs normalising)
- `YearBuilt` - year built, a small number of NaNs
- `MunicipalTax` - a tax paid to local authorities for a building
- `SchoolTax` - a tax paid to local school for education
- `CondoFees` - a levy paid to cover maintenance costs (Polish: czynsz) - a lot of NaNs
- `HeatingEnergy` - type of energy used to heat the house
- `HeatingSystem` - irrelevant (also a lot of NaNs)
- `Proximity` - facilities in neighbourhood
- `SewageSystem` - the type of system used to siphon off sewage 
- `LotAssessment` - building assessment made by auctioneers at auction (lots of NaNs)
- `BuildingAssessment` - lots of nans
- `Pool` - type of pool (lots of nans, but it may indicate absence of pool)
- `View` - type of view, e.g. panoramic, water (lots of NaNs)
- `Garage` - type of garage
- `Parking` - type of parking
- `Zoning` - zoning 
- `Area` - area of the property (house + facilities; lots of NaNs)
- `WaterSupply` - type of water supply
- `LotDepth`, `Lotfrontage`, `FloorCovering`, `Topography`, `Insurance` - drop (lots of NaNs)
- `MunicipalAssessment` - an assessment of the building's price made by local authorities

In [16]:
# Keep only the listing id and the two features engineered below, and discard
# every row that is missing any of the three.
kept_columns = ['MlsNumber', 'LivingArea', 'Proximity']
data = data[kept_columns].dropna()

Dealing with `LivingArea` feature.

In [17]:
# LivingArea is split once, on its first space, into a numeric part and the
# remaining unit text.
area_parts = data['LivingArea'].str.split(' ', n=1, expand=True)
# Remove commas (presumably thousands separators) before the float conversion.
area_value = area_parts[0].str.replace(',', '').astype(float)
# Store both parts as new columns and retire the raw text column.
data = data.assign(AreaNumber=area_value, AreaUnits=area_parts[1]).drop(columns='LivingArea')

In [18]:
# Convert areas given in square feet to square metres.
# 'PC' and 'pi²' are the French abbreviations for square feet (pieds carrés),
# so they are rescaled here together with the English labels instead of being
# left unconverted.
SQFT_TO_SQM = 0.09290304  # 1 ft² = (0.3048 m)²
is_square_feet = data['AreaUnits'].isin(['Square Feet', 'SF', 'PC', 'pi²'])
data['AreaNumber'] = np.where(is_square_feet, round(data['AreaNumber'] * SQFT_TO_SQM, 2), data['AreaNumber'])
# Clear the unit once the value is metric; the mask was computed before this
# overwrite, so both assignments act on the same rows.
data['AreaUnits'] = np.where(is_square_feet, np.nan, data['AreaUnits'])

In [19]:
# Rows carrying one of these labels (or no label at all) keep their number
# unchanged; only the label is cleared.
# NOTE(review): 'PC' and 'pi²' read like the French abbreviations for square
# feet (pieds carrés). This cell never rescales AreaNumber, so confirm those
# rows really hold square metres by the time they reach it.
handled_labels = ['Square metres', 'Square Metres', 'm²', 'MC', 'PC', 'pi²', None]
label_is_handled = data['AreaUnits'].isin(handled_labels)
data['AreaUnits'] = np.where(label_is_handled, np.nan, data['AreaUnits'])

In [20]:
# Manually correct the rows whose unit text is really a second dimension
# ("<first side> x <second side> <unit>"): the parsed number is only the first
# side, so it is multiplied by the second side and, for feet, converted from
# square feet to square metres.
SQFT_TO_SQM = 0.09290304  # 1 ft² = (0.3048 m)²

# unit text -> factor that turns the parsed first side into square metres.
# Factors are derived from the second side instead of being typed in by hand;
# the hand-typed value for 'x 27 ft' (2.49124513) did not equal
# 27 * 0.09290304 = 2.50838208 like every other feet entry did.
dimension_factors = {
    'x 39 ft': 39 * SQFT_TO_SQM,
    'x 71.60 ft': 71.60 * SQFT_TO_SQM,
    'x 16.78 M': 16.78,
    'x 36.50 ft': 36.50 * SQFT_TO_SQM,
    'x 12.53 M': 12.53,
    'x 47 ft': 47 * SQFT_TO_SQM,
    'x 30 ft': 30 * SQFT_TO_SQM,
    'x 24.30 ft': 24.30 * SQFT_TO_SQM,
    'x 28 ft': 28 * SQFT_TO_SQM,
    'x 34.80 ft': 34.80 * SQFT_TO_SQM,
    # NOTE(review): this entry has always been rounded only, with no factor,
    # unlike every other row; 10.52 * SQFT_TO_SQM would follow the pattern.
    # Confirm which is intended.
    'x 10.52 ft': 1.0,
    'x 27 ft': 27 * SQFT_TO_SQM,
}

# Each row matches at most one unit text, so a single lookup replaces the
# chain of per-unit np.where passes; rows without a listed unit map to NaN
# and keep their current value.
factor = data['AreaUnits'].map(dimension_factors)
data['AreaNumber'] = np.where(factor.notna(), round(data['AreaNumber'] * factor, 2), data['AreaNumber'])

# The unit text is no longer needed once every handled row is in square metres.
data = data.drop(columns='AreaUnits')

Dealing with `Proximity` feature.

In [164]:
# version first: one-hot encode the facilities by their position in the list
# NOTE(review): this cell's execution count (In [164]) is out of sequence with
# its neighbours, and "version second" below encodes the same column again -
# confirm these positional dummy columns are still wanted in `data`.
proximity = data['Proximity'].str.split(', ', expand=True)
# Name the columns after their position. The width now follows the data; the
# former hard-coded 16 raised a length mismatch for any other width.
proximity.columns = [str(position) for position in range(proximity.shape[1])]
# One block of dummy columns per position, prefixed with that position. The
# loop variable replaces the separate `iter` counter, which shadowed the
# builtin of the same name.
for position in proximity.columns:
    data = data.join(pd.get_dummies(proximity[position], prefix=position))

In [21]:
# Replace each Proximity string with the list of entries it contains,
# splitting on the ", " separator.
data = data.assign(Proximity=data['Proximity'].str.split(', '))

In [22]:
# version second: gather every distinct entry that appears in any row's list
proximitySet = data['Proximity'].tolist()
distinct_facilities = set()
for facilities in proximitySet:
    distinct_facilities.update(facilities)
proximity = list(distinct_facilities)
proximity

['',
 'Quartier des spectacles',
 'Public transportation',
 'M�tro Monk',
 'Garderies',
 'th�atres',
 'Maisonneuve Park',
 'rue St-Denis',
 'Fairview shopping center',
 'Jardin Botanique',
 'Groceries',
 'clinic',
 'High school',
 'IGA',
 'bell centre',
 'plateau',
 'gym',
 'Place Versailles',
 'Maisonneuve Market',
 'March� Maisonneuve',
 "Les Galeries d'Anjou mall",
 'Aquadome,sports center',
 'Jardin botanique',
 'Riviere',
 'Supermarch�s',
 'De multiples services',
 'Old Port of Montral',
 'grocery store',
 'Restos-boutiques-Bar ',
 "Centre d'Achat Forest",
 'grocery stores',
 'Cin�ma Beaubien',
 'Restos-boutiques',
 'Commercial center',
 'M�tro Frontenac',
 'bicycle path',
 'Access to highway 20',
 'medical clinic',
 'Mountain',
 'acc�s autoroutes',
 'supermarket',
 'daycare',
 'Fairview Mall',
 'Restaurants et Bistros',
 'Lachine Canal',
 'Commerces',
 'All within walking distance.',
 'bank',
 'lac St-Louis',
 'German school AVH',
 'Grocery',
 'Centre sportif CEPSUM',
 'Autoroute

In [23]:
# Multi-hot encode the per-row facility lists: one sparse indicator column per
# distinct entry, named after mlb.classes_.
# The Proximity column is taken out of `data` as its own step, so the mutation
# is explicit rather than hidden inside the join() arguments.
facility_lists = data.pop('Proximity')
mlb = MultiLabelBinarizer(sparse_output=True)
facility_flags = pd.DataFrame.sparse.from_spmatrix(
    mlb.fit_transform(facility_lists),
    index=data.index,  # keep the row labels so join() aligns row for row
    columns=mlb.classes_,
)
data = data.join(facility_flags)


In [29]:
# Discard the indicator columns labelled '' and '+' (presumably artefacts of
# splitting the Proximity text - confirm).
data = data.drop(columns=['', '+'])

In [31]:
# Write the engineered frame to disk as UTF-8 CSV, leaving out the index.
output_path = 'extra_data_modified.csv'
data.to_csv(output_path, index=False, encoding='utf-8')