In [1]:
import pandas as pd
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
import re

THE MAIN FINDINGS AND CONCLUSIONS IN THIS NOTEBOOK:

1. Price Reductions: Approximately one-third of the listings have reduced their prices. There is a corresponding column labeled "listingUpdateReason" that captures this information. This indicates that predictions model must take into account how long a flat was on portal before it's available date. 

2. Transportation Information: The dataset contains detailed transportation information, including the names, distances, and types of the three nearest stations. This richness could be valuable for location-based analyses.

3. Council Tax and Property Size: Data on council tax and property size is incomplete, with about half of the entries missing. However, these columns hold potential for estimation and could be useful if completed or imputed accurately.

4. Exclusion of Images and Schools: So far, images and school-related data have been excluded from the analysis. These could be explored in future iterations for additional insights.

5. The amount of scrapped data was almost equal to the expected size for London available in RightMove, meaning Apify is a reliable scrapping data source.

# Data Loading

In [2]:
# data location

data_dir = "../data/raw"
json_files = [os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith(('.json'))]


def get_stations(data):
    "Function gets 3 nearest stations for each property from data, which is list of property jsons"
    processed_data = []
    for entry in data:
        station_data = entry['nearestStations']
        flattened_data = {
            f'station_name{i+1}': station['name'] for i, station in enumerate(station_data)
             }
        flattened_data.update({
        f'distance_to_station{i+1}': station['distance'] for i, station in enumerate(station_data)
             })
        flattened_data.update({
        f'station_type{i+1}': station['types'] for i, station in enumerate(station_data)
             })
        processed_data.append(flattened_data)
    
    return pd.DataFrame(processed_data)


def get_coordinates(data):
    
    processed_data = []

    for entry in data:
        coordinates = entry['coordinates']
        flattened_data = {
            f'latitude': coordinates['latitude']
             }
        flattened_data.update({f'longitude': coordinates['longitude']}
           )
        processed_data.append(flattened_data)
        
    return pd.DataFrame(processed_data)


datasets = []

for batch in json_files:
    with open(batch, 'r') as file:
        
        js = json.load(file)
        dfs = pd.DataFrame(js)
        coord = get_coordinates(js)
        stations = get_stations(js)
        dfs = pd.concat([dfs,coord,stations],axis=1)
        datasets.append(dfs)
        
        
df = pd.concat(datasets)
df = df.drop(["coordinates","nearestStations"],axis=1) #  these were processed by the above functions 
df = df.drop_duplicates(subset="id")
df = df.set_index("id")  #  id is the unique property rightmove identifier 


  df = pd.concat(datasets)


# Data Cleaning

In [3]:
df.shape

(31434, 66)

![Alt Text](/home/zcemg08/projects/AIRFLOW_DATA_PROPERTIES/images/expected_df_size.png)


Finding.1  The number of rows as expected -> aplify reliable 

In [4]:
# For each column in dataframe show the number of missing values in descending order

missing_values = df.isna().sum()
missing_values_table = missing_values.reset_index()
missing_values_table.columns = ['Column', 'Missing Values']
missing_values_table = missing_values_table.sort_values(by='Missing Values', ascending=False)
missing_values_table.iloc[:10]

Unnamed: 0,Column,Missing Values
27,groundRentPercentageIncrease,31429
26,groundRentReviewPeriodInYears,30937
34,tenure,30026
25,annualGroundRent,29776
30,domesticRates,29612
28,annualServiceCharge,29025
35,yearsRemainingOnLease,28871
19,minimumTermInMonths,26234
53,sizeSqFeetMax,19892
52,sizeSqFeetMin,19892


In [5]:
# Remove columns (>95% empty in dataset)

df = df.drop(["groundRentPercentageIncrease", 
              "groundRentReviewPeriodInYears",
              "tenure",
              "annualGroundRent",
              "domesticRates", 
              "annualServiceCharge",
              "yearsRemainingOnLease", 
              "minimumTermInMonths"],axis=1)

In [6]:
# Display remaining columns 

df.columns

Index(['url', 'title', 'displayAddress', 'countryCode', 'deliveryPointId',
       'ukCountry', 'outcode', 'incode', 'bathrooms', 'bedrooms', 'agent',
       'agentPhone', 'agentLogo', 'agentDisplayAddress', 'propertyType',
       'price', 'secondaryPrice', 'letAvailableDate', 'deposit', 'letType',
       'furnishType', 'type', 'councilTaxExempt', 'councilTaxIncluded',
       'councilTaxBand', 'description', 'descriptionHtml', 'features',
       'images', 'brochures', 'floorplans', 'epc', 'published', 'archived',
       'sold', 'tags', 'agentProfileUrl', 'agentListingsUrl',
       'agentDescriptionHtml', 'listingUpdateReason', 'listingUpdateDate',
       'firstVisibleDate', 'displayStatus', 'addedOn', 'sizeSqFeetMin',
       'sizeSqFeetMax', 'latitude', 'longitude', 'station_name1',
       'station_name2', 'station_name3', 'distance_to_station1',
       'distance_to_station2', 'distance_to_station3', 'station_type1',
       'station_type2', 'station_type3', 'nearestSchools'],
      dtyp

In [7]:
# there are no plans for CNNs at the moment

df["images"].iloc[0]

['https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_01_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_00_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_02_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_03_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_04_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_05_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_06_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_07_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_08_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_09_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110_IMG_10_0000.jpeg',
 'https://media.rightmove.co.uk/16k/15948/154825190/15948_1304110

In [8]:
# Remove:  
# agents info, 
# ukCountry -> all in england, 
# type -> always rent, 
# brochures -> agents adds", 
# Image -> require CNN,
# deliveryPointId -> specific to rightmove",
# nearestSchools -> will be used later
# countryCode -> all GB, 
# url - not needed for predictions, 

df = df.drop(['agent', 'agentPhone', 'agentLogo', 'agentDisplayAddress',"brochures",
              'agentProfileUrl', 'agentListingsUrl', 'agentDescriptionHtml',
              "images","deliveryPointId","type","ukCountry","countryCode","url","nearestSchools"],axis=1)

In [9]:
# descriptionHtml can be dropped, since its meaning is stored in description
df[['description', 'descriptionHtml']]

Unnamed: 0_level_0,description,descriptionHtml
id,Unnamed: 1_level_1,Unnamed: 2_level_1
154825190,SHORT LET. Outstanding 3 bedroom house in Pars...,SHORT LET. Outstanding 3 bedroom house in Pars...
155893427,SHORT LET. Stunning 4 bedroom house close to c...,SHORT LET. Stunning 4 bedroom house close to c...
154825184,SHORT LET. Located on a pretty lane in central...,SHORT LET. Located on a pretty lane in central...
154274906,* SHORT LET *This beautifully refurbished 6-be...,* SHORT LET *<br /><br />This beautifully refu...
152063288,ONE OF THE LARGEST PENTHOUSES HERE AT CHELSEA ...,ONE OF THE LARGEST PENTHOUSES HERE AT CHELSEA ...
...,...,...
156015842,Property Reference: 2299821.KEY FEATURESSuperb...,Property Reference: 2299821.<br /><br /><b>KEY...
152958695,"SHORT LET. Moments from Wembley Park, this con...","SHORT LET. Moments from Wembley Park, this con..."
152955419,SHORT LET. This fantastic 3rd floor 2 bed flat...,SHORT LET. This fantastic 3rd floor 2 bed flat...
147232748,UNCLE Wembley - Stunningly designed Wembley fl...,UNCLE Wembley - Stunningly designed Wembley fl...


In [10]:
df = df.drop(['descriptionHtml'],axis=1)

Check weird columns 

In [11]:
df["epc"] # energy ratings need CNN to get label A,B,..F rank form image: -> remove

id
154825190    https://media.rightmove.co.uk/16k/15948/154825...
155893427    https://media.rightmove.co.uk/16k/15948/155893...
154825184    https://media.rightmove.co.uk/16k/15948/154825...
154274906    https://media.rightmove.co.uk/45k/44645/154274...
152063288    https://media.rightmove.co.uk/132k/131152/1520...
                                   ...                        
156015842                                                 None
152958695    https://media.rightmove.co.uk/167k/166331/1529...
152955419    https://media.rightmove.co.uk/167k/166331/1529...
147232748                                                 None
155503607         https://www.epcgraph.co.uk/epc.png?86,86,,,M
Name: epc, Length: 31434, dtype: object

In [12]:
df = df.drop(['epc'],axis=1)

In [13]:
# published, archived, sold, tags and displayStatus are almost constants -> remove

print("Published counts:")
print(df['published'].value_counts())

print("\nArchived counts:")
print(df['archived'].value_counts())

print("\nSold counts:")
print(df['sold'].value_counts())

print("\nTags:")
print(df['tags'].value_counts())

print("\nDisplayStatus:")
print(df['displayStatus'].value_counts())

Published counts:
published
True     31430
False        4
Name: count, dtype: int64

Archived counts:
archived
False    31430
True         4
Name: count, dtype: int64

Sold counts:
sold
False    31434
Name: count, dtype: int64

Tags:
tags
[]                               29044
[NEW_HOME]                        1262
[BUILT_FOR_RENTERS]                846
[BUILT_FOR_RENTERS, NEW_HOME]      281
[LET_AGREED, NEW_HOME]               1
Name: count, dtype: int64

DisplayStatus:
displayStatus
    31434
Name: count, dtype: int64


In [14]:
df = df.drop(["published",	"archived",	"sold", "tags", "displayStatus"],axis=1)

# Resulting Dataset 

## 1. Transport data to 3 nearest stations 

In [15]:
transport_columns = ['station_name1', 'station_name2', 'station_name3', 
                     'distance_to_station1','distance_to_station2', 'distance_to_station3',
                     'station_type1', 'station_type2', 'station_type3']

df[transport_columns].head(4)

Unnamed: 0_level_0,station_name1,station_name2,station_name3,distance_to_station1,distance_to_station2,distance_to_station3,station_type1,station_type2,station_type3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
154825190,Parsons Green Station,Fulham Broadway Station,Imperial Wharf Station,0.086686,0.375075,0.713814,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]
155893427,Parsons Green Station,Imperial Wharf Station,Fulham Broadway Station,0.400948,0.496335,0.584006,[LONDON_UNDERGROUND],[NATIONAL_TRAIN],[LONDON_UNDERGROUND]
154825184,Parsons Green Station,Fulham Broadway Station,Imperial Wharf Station,0.086686,0.375075,0.713814,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]
154274906,Putney Bridge Station,Parsons Green Station,Putney Station,0.140975,0.488032,0.699539,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]


In [16]:
df[transport_columns].isna().sum()

station_name1           2
station_name2           2
station_name3           3
distance_to_station1    2
distance_to_station2    2
distance_to_station3    3
station_type1           2
station_type2           2
station_type3           3
dtype: int64

1. data outliers -> removes rows where stations data is not present

In [17]:
df = df[~df["station_name1"].isna()]
df[["distance_to_station1",	"distance_to_station2",	"distance_to_station3"]] = df[["distance_to_station1",	"distance_to_station2",	"distance_to_station3"]].astype("float16")

## 2. Finantial details

In [18]:
finantial_columns = ['price', 'secondaryPrice', 'deposit', 'councilTaxExempt','councilTaxIncluded', 'councilTaxBand']

df[finantial_columns].head(4)

Unnamed: 0_level_0,price,secondaryPrice,deposit,councilTaxExempt,councilTaxIncluded,councilTaxBand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
154825190,"£18,633 pcm","£4,300 pw",4300.0,False,False,
155893427,"£13,000 pcm","£3,000 pw",3000.0,False,False,
154825184,"£14,300 pcm","£3,300 pw",3300.0,False,False,
154274906,"£15,000 pcm","£3,462 pw",10385.0,False,False,


In [19]:
df[finantial_columns].isna().sum()

price                     0
secondaryPrice            0
deposit                6492
councilTaxExempt          0
councilTaxIncluded        0
councilTaxBand        16403
dtype: int64

1. Deposit will be biniresed yes/no to avoid price prediction data leak, therefore Nulll in deposit data is not a problem
2. Despite councilTaxBand 50% of data missiing, it can be filled with predictions 
3. Transform prices

In [20]:
df['deposit'].fillna(0, inplace=True)
df['deposit'] = df['deposit'].apply(lambda x: True if x != 0 else False)
df['deposit'] = df['deposit'].astype(bool)

In [21]:
print("fraction of houses with uknown council tax={}".format(df["councilTaxBand"].isna().sum()/len(df)))

fraction of houses with uknown council tax=0.5218567065411046


In [22]:
def transform_price(x):
    string = x.split(" ")[0]
    numbers = re.findall(r'\d+', string)
    return int(''.join(numbers))

df["price"] = df["price"].apply(lambda x: transform_price(x))
df["secondaryPrice"] = df["secondaryPrice"].apply(lambda x: transform_price(x))

## 3. Address 

In [23]:
location_columns = [ 'displayAddress','outcode', 'incode', 'latitude', 'longitude']

df[location_columns].head(3)

Unnamed: 0_level_0,displayAddress,outcode,incode,latitude,longitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
154825190,"Novello Street, Parsons Green, London, SW6",SW6,4JB,51.47583,-0.199436
155893427,"Studdridge Street, Fulham, London, SW6",SW6,3SL,51.472097,-0.193479
154825184,"Novello Street, Parsons Green, London, SW6",SW6,4JB,51.47583,-0.199436


In [24]:
df[location_columns].isna().sum()

displayAddress    0
outcode           0
incode            0
latitude          0
longitude         0
dtype: int64

In [25]:
df[["latitude","longitude"]] = df[["latitude","longitude"]].astype(float)

In [26]:
# inconsitent format leave as it is 
df["displayAddress"].apply(lambda x: x.split(","))

id
154825190    [Novello Street,  Parsons Green,  London,  SW6]
155893427        [Studdridge Street,  Fulham,  London,  SW6]
154825184    [Novello Street,  Parsons Green,  London,  SW6]
154274906             [Ranelagh Avenue,  \nHurlingham,  SW6]
152063288     [Compass House,  Chelsea Creek,  Fulham,  SW6]
                                  ...                       
156015842                   [Elizabeth House,  London,  HA9]
152958695    [Matthews Close,  Wembley Park,  Wembley,  HA9]
152955419    [Matthews Close,  Wembley Park,  Wembley,  HA9]
147232748            [Park Lane,  Wembley,  Middlesex,  HA9]
155503607                            [Bowery,  London,  HA9]
Name: displayAddress, Length: 31432, dtype: object

## 4. Property Features

In [27]:
property_features = ['bedrooms',"bathrooms","propertyType",'sizeSqFeetMin', 'sizeSqFeetMax',"floorplans"]

df[property_features].head(5)

Unnamed: 0_level_0,bedrooms,bathrooms,propertyType,sizeSqFeetMin,sizeSqFeetMax,floorplans
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
154825190,3.0,2.0,House,1439.0,1439.0,[{'url': 'https://media.rightmove.co.uk/16k/15...
155893427,4.0,4.0,House,2479.0,2479.0,[{'url': 'https://media.rightmove.co.uk/16k/15...
154825184,4.0,2.0,House,1185.0,1185.0,[{'url': 'https://media.rightmove.co.uk/16k/15...
154274906,6.0,3.0,Terraced,2879.0,2879.0,[]
152063288,5.0,6.0,Penthouse,4015.0,4015.0,[{'url': 'https://media.rightmove.co.uk/132k/1...


1. floorplans to be dropped, CNNs next time
2. sizeSqFeetMin, sizeSqFeetMax can be estimated
3. remove outliers

In [28]:
df[property_features].isna().sum()

bedrooms          2665
bathrooms         1802
propertyType         0
sizeSqFeetMin    19890
sizeSqFeetMax    19890
floorplans           0
dtype: int64

In [29]:
# negligible ouliers

len(df[df["bedrooms"].isna() & df["bathrooms"].isna()])/len(df)

0.016034614405701197

In [30]:
df = df[~(df["bedrooms"].isna() & df["bathrooms"].isna())] # remove negligible ouliers
df = df.drop("floorplans",axis=1)

df["sizeSqFeetMax"] = df["sizeSqFeetMax"].fillna(0)
df["sizeSqFeetMin"] = df["sizeSqFeetMin"].fillna(0)
df[["sizeSqFeetMax","sizeSqFeetMin"]] = df[["sizeSqFeetMax","sizeSqFeetMin"]].astype(float)

In [31]:
len(df[df["bathrooms"].isna()])/len(df)

0.041968442834971545

In [32]:
df["bathrooms"] = df["bathrooms"].fillna(1) # at least one toilet is present 

In [33]:
df[df["bedrooms"].isna()]["propertyType"].value_counts()

propertyType
Studio                          1313
Apartment                        377
Flat                             333
House Share                       72
Ground Flat                       13
House                             11
Flat Share                         7
Not Specified                      6
Terraced                           5
Parking                            4
Serviced Apartments                4
Block of Apartments                3
Garages                            3
Duplex                             2
Retirement Property                2
Detached                           2
Semi-Detached                      1
House of Multiple Occupation       1
House Boat                         1
Hotel Room                         1
Name: count, dtype: int64

In [34]:
# remove outliers bedrooms are none, except for studio

df = df[~((df["propertyType"] != "Studio") & (df["bedrooms"].isna()))]


## 4. Time related variables

In [35]:
time_related = ['addedOn','firstVisibleDate', 'letAvailableDate', 'listingUpdateReason', 'listingUpdateDate']

df[time_related].head(5)

Unnamed: 0_level_0,addedOn,firstVisibleDate,letAvailableDate,listingUpdateReason,listingUpdateDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
154825190,11/11/2024,2024-11-11T11:43:21Z,Ask agent,new,2024-11-11T11:49:03Z
155893427,Reduced on 06/11/2024,2024-12-12T12:42:10Z,Ask agent,price_reduced,2024-11-06T10:44:16Z
154825184,Reduced on 17/12/2024,2024-11-11T11:43:18Z,Ask agent,price_reduced,2024-12-17T11:40:51Z
154274906,Reduced on 12/11/2024,2024-10-28T11:12:35Z,Now,price_reduced,2024-11-12T11:58:11Z
152063288,05/09/2024,2024-09-05T10:10:35Z,Now,new,2024-09-05T10:16:03Z


In [36]:
df[time_related].isna().sum()

addedOn                11
firstVisibleDate        0
letAvailableDate        0
listingUpdateReason    10
listingUpdateDate      10
dtype: int64

In [37]:
df = df[~df["addedOn"].isna()] 

In [38]:
df[df["listingUpdateDate"].isna()][time_related] # datapoint could be added recently, but for data manipulation convinience will be removed

Unnamed: 0_level_0,addedOn,firstVisibleDate,letAvailableDate,listingUpdateReason,listingUpdateDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
156543143,Added today,2025-01-06T16:35:34Z,19/02/2025,,


In [39]:
df = df[~df["listingUpdateDate"].isna()]

In [40]:
df["listingUpdateReason"].value_counts()

listingUpdateReason
new              20748
price_reduced     9320
Name: count, dtype: int64

###  SIGNIFICANT DISCOVERY: THE SAME LISTINGS CHANGE PRICE (VERY FREQUENTLY IN DATA) !

In [41]:
df["listingUpdateDate"] =  pd.to_datetime(df['listingUpdateDate']).dt.date
df["firstVisibleDate"] =  pd.to_datetime(df["firstVisibleDate"]).dt.date

In [42]:
df2 = df.copy()
df2["counter"] = df2["addedOn"].apply(lambda x: len(x.split()))

In [43]:
df2[(df2["counter"]==2) & (df2["listingUpdateReason"]=="price_reduced")][time_related]

Unnamed: 0_level_0,addedOn,firstVisibleDate,letAvailableDate,listingUpdateReason,listingUpdateDate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
156527702,Reduced yesterday,2025-01-06,Now,price_reduced,2025-01-06
155936288,Reduced yesterday,2024-12-13,Now,price_reduced,2025-01-06
155592647,Reduced yesterday,2024-12-02,31/01/2025,price_reduced,2025-01-06
155469410,Reduced yesterday,2024-11-28,Now,price_reduced,2025-01-06
155834402,Reduced yesterday,2024-12-10,17/02/2025,price_reduced,2025-01-06
...,...,...,...,...,...
156532793,Reduced yesterday,2025-01-06,01/02/2025,price_reduced,2025-01-07
142689713,Reduced today,2023-12-05,Now,price_reduced,2025-01-08
156461729,Reduced today,2025-01-03,Now,price_reduced,2025-01-05
154353677,Reduced yesterday,2024-10-29,Now,price_reduced,2025-01-04


Dates need further investigation,however at this stage of cleaning/missing values its enough

In [44]:
del df2

## 5. Text columns 

In [45]:
nlp_columns = ["title","description","features"]

df[nlp_columns].head(7)

Unnamed: 0_level_0,title,description,features
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
154825190,"3 bedroom house for rent in Novello Street, Pa...",SHORT LET. Outstanding 3 bedroom house in Pars...,"[Outstanding 3 bedroom house in Parsons Green,..."
155893427,"4 bedroom house for rent in Studdridge Street,...",SHORT LET. Stunning 4 bedroom house close to c...,[Stunning 4 bedroom multi-floor house with a b...
154825184,"4 bedroom house for rent in Novello Street, Pa...",SHORT LET. Located on a pretty lane in central...,"[Large reception room with fireplace, Kitchen/..."
154274906,6 bedroom terraced house for rent in Ranelagh ...,* SHORT LET *This beautifully refurbished 6-be...,[]
152063288,"5 bedroom penthouse for rent in Compass House,...",ONE OF THE LARGEST PENTHOUSES HERE AT CHELSEA ...,[One Of The Largest Penthouses At Chelsea Cree...
155916665,5 bedroom detached house for rent in Fulham Pa...,The property features two sensational double r...,"[Detached double fronted house, Five bedrooms,..."
152396447,"5 bedroom flat for rent in Compass House, \n5 ...","An unbelievable five bedroom, six-bathroom pen...","[5 bedrooms, 2 reception rooms, 6 bathrooms, L..."


In [46]:
df[nlp_columns].isna().sum()

title          0
description    0
features       0
dtype: int64

In [47]:
df[nlp_columns].dtypes

title          object
description    object
features       object
dtype: object

Final Pre-Cleaned DataFrame

In [48]:
df

Unnamed: 0_level_0,title,displayAddress,outcode,incode,bathrooms,bedrooms,propertyType,price,secondaryPrice,letAvailableDate,...,longitude,station_name1,station_name2,station_name3,distance_to_station1,distance_to_station2,distance_to_station3,station_type1,station_type2,station_type3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
154825190,"3 bedroom house for rent in Novello Street, Pa...","Novello Street, Parsons Green, London, SW6",SW6,4JB,2.0,3.0,House,18633,4300,Ask agent,...,-0.199436,Parsons Green Station,Fulham Broadway Station,Imperial Wharf Station,0.086670,0.375000,0.713867,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]
155893427,"4 bedroom house for rent in Studdridge Street,...","Studdridge Street, Fulham, London, SW6",SW6,3SL,4.0,4.0,House,13000,3000,Ask agent,...,-0.193479,Parsons Green Station,Imperial Wharf Station,Fulham Broadway Station,0.400879,0.496338,0.583984,[LONDON_UNDERGROUND],[NATIONAL_TRAIN],[LONDON_UNDERGROUND]
154825184,"4 bedroom house for rent in Novello Street, Pa...","Novello Street, Parsons Green, London, SW6",SW6,4JB,2.0,4.0,House,14300,3300,Ask agent,...,-0.199436,Parsons Green Station,Fulham Broadway Station,Imperial Wharf Station,0.086670,0.375000,0.713867,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]
154274906,6 bedroom terraced house for rent in Ranelagh ...,"Ranelagh Avenue, \nHurlingham, SW6",SW6,3PJ,3.0,6.0,Terraced,15000,3462,Now,...,-0.206303,Putney Bridge Station,Parsons Green Station,Putney Station,0.140991,0.488037,0.699707,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]
152063288,"5 bedroom penthouse for rent in Compass House,...","Compass House, Chelsea Creek, Fulham, SW6",SW6,2FB,6.0,5.0,Penthouse,14998,3461,Now,...,-0.185162,Imperial Wharf Station,Fulham Broadway Station,Parsons Green Station,0.099426,0.550781,0.694336,[NATIONAL_TRAIN],[LONDON_UNDERGROUND],[LONDON_UNDERGROUND]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156015842,1 bedroom penthouse for rent in Elizabeth Hous...,"Elizabeth House, London, HA9",HA9,6DB,1.0,1.0,Penthouse,1825,421,Now,...,-0.287843,Wembley Stadium Station,Wembley Central Station,Wembley Park Station,0.126709,0.369385,0.789551,[NATIONAL_TRAIN],"[NATIONAL_TRAIN, LONDON_UNDERGROUND, LONDON_OV...",[LONDON_UNDERGROUND]
152958695,"2 bedroom flat for rent in Matthews Close, Wem...","Matthews Close, Wembley Park, Wembley, HA9",HA9,8FG,2.0,2.0,Flat,3640,840,Ask agent,...,-0.284004,Wembley Park Station,Preston Road Station,Wembley Stadium Station,0.227295,0.713379,0.703613,[LONDON_UNDERGROUND],[LONDON_UNDERGROUND],[NATIONAL_TRAIN]
152955419,"2 bedroom flat for rent in Matthews Close, Wem...","Matthews Close, Wembley Park, Wembley, HA9",HA9,8FD,2.0,2.0,Flat,4000,923,Ask agent,...,-0.283130,Wembley Park Station,Wembley Stadium Station,Preston Road Station,0.177368,0.672363,0.767090,[LONDON_UNDERGROUND],[NATIONAL_TRAIN],[LONDON_UNDERGROUND]
147232748,"2 bedroom flat for rent in Park Lane, Wembley,...","Park Lane, Wembley, Middlesex, HA9",HA9,7RH,2.0,2.0,Flat,2450,565,Ask agent,...,-0.292910,Wembley Central Station,Wembley Stadium Station,North Wembley Station,0.189453,0.296143,0.771973,"[NATIONAL_TRAIN, LONDON_UNDERGROUND, LONDON_OV...",[NATIONAL_TRAIN],"[LONDON_UNDERGROUND, LONDON_OVERGROUND]"


In [49]:
df.to_parquet("../data/processed/rent_london.parquet")