In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.tokenize import PunktSentenceTokenizer


In [None]:
# Function to clean text using DeepSeek-R1
def clean_text_with_deepseek(text):
    """Ask a language model to strip non-property content from a description.

    Relies on module-level ``tokenizer`` and ``model`` objects (Hugging Face
    style: callable tokenizer with ``decode``, model with ``generate``) that
    must be created before this function is called.
    NOTE(review): assumes a decoder-only (causal) model, whose ``generate``
    output starts with the prompt tokens — confirm for the model actually used.

    Args:
        text: Raw property description (may contain HTML, contact details, ...).

    Returns:
        str: Only the text generated after the prompt, with surrounding
        whitespace removed.
    """
    prompt = f"""
    You are a real estate data cleaning assistant. Your task is to remove irrelevant information from property descriptions, such as contact details, HTML tags, promotional content, or anything unrelated to the property itself. Return only the cleaned description.

    Input: {text}
    Cleaned Description:
    """
    # NOTE: truncation at 512 tokens can cut off the trailing
    # "Cleaned Description:" cue for very long inputs.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    prompt_length = inputs["input_ids"].shape[1]

    # max_new_tokens bounds only the generated part; the previous max_length=200
    # also counted the prompt, leaving no room to generate for longer prompts.
    # NOTE(review): temperature only takes effect when sampling is enabled
    # (do_sample=True); left unchanged here to keep decoding deterministic.
    outputs = model.generate(**inputs, max_new_tokens=200, temperature=0.1, num_return_sequences=1)

    # Decode only the newly generated tokens so the prompt is not echoed back.
    generated_tokens = outputs[0][prompt_length:]
    cleaned_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return cleaned_text.strip()

# Example usage: a snippet containing HTML tags and an e-mail address.
# Requires `tokenizer` and `model` to be loaded beforehand; this cell has no
# execution count, i.e. it was not run in the saved notebook.
text = "<p>3-bedroom house <br>Contact: john@example.com</p>"
cleaned_text = clean_text_with_deepseek(text)
print(cleaned_text)

In [2]:
data = pd.read_parquet("../data/processed/processed.parquet")

In [3]:
data["description"]

0        Stylish two bedroom two bathroom apartment wit...
1        Ambassador are pleased to offer this recently ...
2        SHORT LET - A modern one bedroom apartment for...
3        Property Reference: 2288318.Spacious two bedro...
4        Lexadon are excited to bring a high spec one d...
                               ...                        
28503    LONG LET. A wonderful 2 bedroom flat boasting ...
28504    Stirling Ackroyd are proud to present this goo...
28505    Situated on the 8th floor of Lockgate Lodge, t...
28506    This smart one bedroom ground floor flat offer...
28507    A rare opportunity to be the tenant of this ex...
Name: description, Length: 28508, dtype: object

In [4]:
# ChatGPT-generated noisy property descriptions used to test the cleaning function

noise_data_generated = [ """

Check out this amazing 2-bedroom apartment in the heart of the city with a fully-equipped kitchen! 
It’s just a short walk from the subway and close to all major shopping centers. Don't miss this fantastic opportunity. 
Contact us for more information at 0800-123-REAL. 
Visit https://realestate.com/listing456 to book a viewing.
"""
,
"""
A beautiful 3-bedroom house located in a family-friendly neighborhood. Enjoy a large garden and off-road parking! 
The house is walking distance from local schools and parks, making it ideal for families. 
Call us today at 0207-456-789 for a viewing or visit our website at www.exampleproperty.com.
"""
,
"""
Modern studio apartment available in the city center, just 5 minutes from the train station. 
It features an open-plan living area, a compact kitchen, and a private bathroom. Perfect for city workers! 
Get in touch now to schedule a tour. 
For more details, check our listing on https://www.realtysite.com/listing789.
"""
,
"""
This stylish 1-bedroom flat offers contemporary living in the bustling heart of London. 
It is well connected by public transport, with buses and trains just a stone’s throw away. 
If you're interested in this flat, reach out to us at info@luxuryapartments.com or call 0800-999-000.
"""
,

"""
Spacious 4-bedroom family home with a large garden and private driveway. The house features a cozy living room with a fireplace. 
Located in a peaceful neighborhood, this property is perfect for a growing family. 
Contact us today at +44 123-456-7890 for more information or visit our site for a virtual tour: www.familyhomes.com.
"""
,

"""
Fantastic 2-bedroom apartment with amazing views over the city skyline! Located near public transport, shops, and restaurants. 
This apartment also includes high-speed internet, perfect for remote workers. 
Book a viewing now at our website or contact our agent via WhatsApp at +44 700-123-456.
"""
,
"""
Charming 5-bedroom house with a large backyard, perfect for hosting family gatherings. The property is fully furnished and move-in ready. 
Located near a golf course and walking trails, this house is ideal for nature lovers. 
Call 0207-123-4567 to schedule a viewing or email info@dreamhomes.com.
"""
,
"""
Newly renovated 3-bedroom flat with a modern kitchen and stylish interiors. 
Located in the city’s most vibrant district, this apartment is within walking distance to cafes, shops, and art galleries. 
Visit our website for more details and to view our current offers: www.cityliving.com.
"""
,

"""
Gorgeous 2-bedroom flat in a historic building with original architectural features. 
This property combines old-world charm with modern conveniences and is perfect for those who love character. 
Book a tour today at info@historichomes.com or call +44 123-456-7890.
"""
,

"""
Spacious penthouse apartment with stunning views over the city and the river. It includes a private balcony and high-end appliances. 
Perfect for anyone seeking a luxurious lifestyle. 
For more information, contact us via email at penthouseview@luxuryhomes.com or call us at 0800-LUXURY-01.
"""]

In [5]:
def simple_text_clean(text):
    """Split a listing description into sentences and drop boilerplate ones.

    URLs are stripped first, then the text is split with NLTK's
    ``PunktSentenceTokenizer`` and every sentence containing one of the
    boilerplate phrases (case-insensitive substring match) is discarded.

    Args:
        text: Raw description. Non-string values (e.g. ``None`` or a NaN
            coming from a pandas column) are treated as empty.

    Returns:
        list[str]: The surviving sentences in their original order, as
        produced by the tokenizer (no additional whitespace trimming).
    """
    if not isinstance(text, str):
        return []

    # Matches http(s):// and bare www. links. The look-behind keeps trailing
    # sentence punctuation out of the match so sentence boundaries survive.
    url_pattern = r"(?:https?://|\bwww\.)\S+(?<![.,;:!?)])"
    # re.sub returns a new string; the result must be assigned back
    # (the original call discarded it, so URLs were never removed).
    text = re.sub(url_pattern, "", text)

    tokenizer_sentences = PunktSentenceTokenizer()
    # Define the list of boilerplate phrases
    # NOTE: matching is by plain substring, so e.g. "call" also hits "locally".
    # Kept deliberately because many descriptions have words glued together
    # without spaces, where word-boundary matching would miss real boilerplate.
    boilerplate_phrases = [
          "contact", "for more info", "call", "viewings","visit",
          "arrange", "available now", "great investment", "schedule",
          "agent", "short let", "long let", "terms","income","photo","please note"]

    # Tokenize the text into sentences
    sentences = tokenizer_sentences.tokenize(text)

    # Remove sentences containing boilerplate phrases
    cleaned_sentences = []
    for sentence in sentences:
        lowered = sentence.lower()  # lower-case once per sentence, not per phrase
        if not any(phrase in lowered for phrase in boilerplate_phrases):
            cleaned_sentences.append(sentence)

    return cleaned_sentences

In [6]:
noise_data_generated[0]

"\n\nCheck out this amazing 2-bedroom apartment in the heart of the city with a fully-equipped kitchen! \nIt’s just a short walk from the subway and close to all major shopping centers. Don't miss this fantastic opportunity. \nContact us for more information at 0800-123-REAL. \nVisit https://realestate.com/listing456 to book a viewing.\n"

In [7]:
simple_text_clean(noise_data_generated[0])

['\n\nCheck out this amazing 2-bedroom apartment in the heart of the city with a fully-equipped kitchen!',
 'It’s just a short walk from the subway and close to all major shopping centers.',
 "Don't miss this fantastic opportunity."]

In [8]:
noise_data_generated[1]

'\nA beautiful 3-bedroom house located in a family-friendly neighborhood. Enjoy a large garden and off-road parking! \nThe house is walking distance from local schools and parks, making it ideal for families. \nCall us today at 0207-456-789 for a viewing or visit our website at www.exampleproperty.com.\n'

In [9]:
simple_text_clean(noise_data_generated[1])

['\nA beautiful 3-bedroom house located in a family-friendly neighborhood.',
 'Enjoy a large garden and off-road parking!',
 'The house is walking distance from local schools and parks, making it ideal for families.']

In [10]:
text_test = noise_data_generated[-4]
text_test

'\nCharming 5-bedroom house with a large backyard, perfect for hosting family gatherings. The property is fully furnished and move-in ready. \nLocated near a golf course and walking trails, this house is ideal for nature lovers. \nCall 0207-123-4567 to schedule a viewing or email info@dreamhomes.com.\n'

In [11]:
print("\n".join(simple_text_clean(text_test)))


Charming 5-bedroom house with a large backyard, perfect for hosting family gatherings.
The property is fully furnished and move-in ready.
Located near a golf course and walking trails, this house is ideal for nature lovers.


In [12]:
text_test = data["description"].loc[4000]

print(text_test)

SHORT LET. A fantastic 2 bedroom 2nd and 3rd floor flat, situated within an attractive period conversion, offering stunning contemporary interiors and located in the sought-after area of Hampstead.Situated moments from the local amenities of Finchley Road, while the trendy shops, bars and restaurants of Hampstead are nearby. Please use the reference CHPK1240434 when contacting Foxtons.


In [13]:
print("\n".join(simple_text_clean(text_test)))

A fantastic 2 bedroom 2nd and 3rd floor flat, situated within an attractive period conversion, offering stunning contemporary interiors and located in the sought-after area of Hampstead.Situated moments from the local amenities of Finchley Road, while the trendy shops, bars and restaurants of Hampstead are nearby.


In [14]:
text_test = data["description"].loc[2300]
text_test

'LONG LET. An exquisite three bedroom house just moments away from the Centre of Kingston boasts fantastic living space, a beautiful marble kitchen with modern appliances and three spacious bedrooms.Grange Road is ideally located just moments from Kingston Centre and the wide veriety of amenities, bars and reataurants it has to offer, as well as being just down the from from Kingston University. Please use the reference CHPK2918094 when contacting Foxtons.'

In [15]:
print("\n".join(simple_text_clean(text_test)))

An exquisite three bedroom house just moments away from the Centre of Kingston boasts fantastic living space, a beautiful marble kitchen with modern appliances and three spacious bedrooms.Grange Road is ideally located just moments from Kingston Centre and the wide veriety of amenities, bars and reataurants it has to offer, as well as being just down the from from Kingston University.


In [16]:
text_test = data["description"].loc[457]

print(text_test)

LONG LET. A modern studio apartment in a brand new building boasting a sleek fitted kitchen, bright and airy interior and a well presented bedroom.Lockgate Lodge is set in a brand new development close to a wide range of amenities found in Lewisham as well as superb transport links from Lewisham DLR and Rail Station. Please use the reference B2RC5269320 when contacting Foxtons.


In [17]:
print("\n".join(simple_text_clean(text_test)))

A modern studio apartment in a brand new building boasting a sleek fitted kitchen, bright and airy interior and a well presented bedroom.Lockgate Lodge is set in a brand new development close to a wide range of amenities found in Lewisham as well as superb transport links from Lewisham DLR and Rail Station.


In [18]:
text_test = data["description"].loc[123]

print(text_test)

Looking for a 2 bed apartment to share with your best friend? Somewhere you can work from home without being under each others feet? We've got an amazing apartment with a *4 WEEKS RENT FREE" offer and co-working spaces for you to use! 

ADVERTISED RENT INCLUDES £2293.85 ANNUAL DISCOUNT FOR THE UNFURNISHED UNIT - EXCLUSIVE JANUARY OFFER**Concession* Price based on 12 contract with 4 Weeks rent free included. After 4 weeks rent free the monthly cost will be £2293.85 Terms & Conditions apply*.
 
Ten Degrees is located in the heart of Croydon, opposite Boxpark and East Croydon Station. Take a direct train to London Bridge and London Victoria Station in 15 minutes or Gatwick in 15 minutes.
 
Greystar are delighted to introduce our  1, 2 and 3 bedroom apartments at Ten Degrees - located conveniently opposite East Croydon Station. Ten degrees offers 43 floors of premium rental apartment with a lifestyle to match.
 
Our pet friendly apartments available furnished or unfurnished are designed to

In [19]:
print("\n".join(simple_text_clean(text_test)))

Looking for a 2 bed apartment to share with your best friend?
Somewhere you can work from home without being under each others feet?
We've got an amazing apartment with a *4 WEEKS RENT FREE" offer and co-working spaces for you to use!
ADVERTISED RENT INCLUDES £2293.85 ANNUAL DISCOUNT FOR THE UNFURNISHED UNIT - EXCLUSIVE JANUARY OFFER**Concession* Price based on 12 contract with 4 Weeks rent free included.
Ten Degrees is located in the heart of Croydon, opposite Boxpark and East Croydon Station.
Take a direct train to London Bridge and London Victoria Station in 15 minutes or Gatwick in 15 minutes.
Greystar are delighted to introduce our  1, 2 and 3 bedroom apartments at Ten Degrees - located conveniently opposite East Croydon Station.
Ten degrees offers 43 floors of premium rental apartment with a lifestyle to match.
Our pet friendly apartments available furnished or unfurnished are designed to the highest specification with floor to ceiling windows, Amtico flooring and spacious open p

In [20]:
text_test = data.iloc[1002]["description"]
text_test

"Indulge in the epitome of luxury living with this stunning two bedroom penthouse apartment in the vibrant City Road, Clerkenwell. Boasting a host of impressive features, this spacious property is sure to captivate even the most discerning of individuals.Step into the inviting semi open-planned lounge and kitchen area, which offers a versatile space for both relaxation and entertaining. The contemporary design is enhanced by floor-to-ceiling windows, bathing the property in an abundance of natural light and offering breathtaking views of the surrounding cityscape.The sleek and modern kitchen is fully equipped with state-of-the-art appliances, making cooking a delight for any culinary enthusiast. The spacious lounge seamlessly extends onto a private balcony, perfect for enjoying a morning coffee or evening drink while taking in the stunning urban panorama.Escape to tranquility with the private roof terrace, a true oasis high above the hustle and bustle of the city. Here, you can bask in

In [21]:
print("\n".join(simple_text_clean(text_test)))

Indulge in the epitome of luxury living with this stunning two bedroom penthouse apartment in the vibrant City Road, Clerkenwell.
Boasting a host of impressive features, this spacious property is sure to captivate even the most discerning of individuals.Step into the inviting semi open-planned lounge and kitchen area, which offers a versatile space for both relaxation and entertaining.
The contemporary design is enhanced by floor-to-ceiling windows, bathing the property in an abundance of natural light and offering breathtaking views of the surrounding cityscape.The sleek and modern kitchen is fully equipped with state-of-the-art appliances, making cooking a delight for any culinary enthusiast.
The spacious lounge seamlessly extends onto a private balcony, perfect for enjoying a morning coffee or evening drink while taking in the stunning urban panorama.Escape to tranquility with the private roof terrace, a true oasis high above the hustle and bustle of the city.
Here, you can bask in 

In [22]:
text_test = data.iloc[8002]["description"]
text_test

'Located on the 3rd floor of Egremont House, this light and bright, two bedroom, two bathroom home is available to rent fully-furnished from January.This East Village apartment sits\xa0less than five minutes’ walk to Stratford station making the morning commute a breeze. The double bedrooms \xa0come complete with built-in wardrobes, a chest of draws and side-tables. While kitchens are fully integrated with appliances including a multi-functional oven and induction hob. A private balcony \xa0connects the open plan kitchen and living area to the great outdoors.Wi-Fi is included and set up for when you move in, getting you connected right away. All Get Living homes have an energy efficiency rating of B or C, helping conserve energy for the planet, and your pocket. Being connected to the district heating and hot water system means no space-hogging boiler in your home, but please note that this means you cannot switch supplier. \xa0Additional information:Please note photos are for illustrat

In [23]:
print("\n".join(simple_text_clean(text_test)))

Located on the 3rd floor of Egremont House, this light and bright, two bedroom, two bathroom home is available to rent fully-furnished from January.This East Village apartment sits less than five minutes’ walk to Stratford station making the morning commute a breeze.
The double bedrooms  come complete with built-in wardrobes, a chest of draws and side-tables.
While kitchens are fully integrated with appliances including a multi-functional oven and induction hob.
A private balcony  connects the open plan kitchen and living area to the great outdoors.Wi-Fi is included and set up for when you move in, getting you connected right away.
All Get Living homes have an energy efficiency rating of B or C, helping conserve energy for the planet, and your pocket.
For 12-month tenancies, please speak to our teamA Security Deposit equal to five weeks’ rent, may be payable for this property.
The deposit will be held by the Tenancy Deposit Scheme:    All Get Living buildings have access to a lift and 

In [24]:
text_test = data.iloc[456]["description"]
text_test

"  PK Properties are delighted to offer this FULLY REFURBISHED 1 Bedroom Ground Floor Flat located walking distance to Harrow town centre.\xa0  Accommodation compromises of 1 Double Bedroom, Living Room, New Fitted Kitchen with appliances. Benefits include Gas Central Heating, Laminteed flooring throughout, Brand new fully tilled bathroom suite. Please Contact our Letting Team on   for a viewing or on What's App on   to view an internal videoAvailable UN Furnished with immediate occupation.\xa0  Local Area:The flat is located within in walking distance to West Harrow and Harrow on The Hill Station, well connected to Central London via the metropolitan Line and National Rail station and vast indoor and outdoor shopping outlets, and many more highly rated cafe's, restaurants and food chains. Also within close proximity to a variety of convenient Super stores, perfect for all your everyday shopping including Tesco's, Morrison's Iceland and post office.\xa0  Further Details:Council Tax: Ha

In [25]:
print("\n".join(simple_text_clean(text_test)))

  PK Properties are delighted to offer this FULLY REFURBISHED 1 Bedroom Ground Floor Flat located walking distance to Harrow town centre.
Accommodation compromises of 1 Double Bedroom, Living Room, New Fitted Kitchen with appliances.
Benefits include Gas Central Heating, Laminteed flooring throughout, Brand new fully tilled bathroom suite.
Local Area:The flat is located within in walking distance to West Harrow and Harrow on The Hill Station, well connected to Central London via the metropolitan Line and National Rail station and vast indoor and outdoor shopping outlets, and many more highly rated cafe's, restaurants and food chains.
Also within close proximity to a variety of convenient Super stores, perfect for all your everyday shopping including Tesco's, Morrison's Iceland and post office.
**


In [26]:
print("\n".join(simple_text_clean(text_test)))

  PK Properties are delighted to offer this FULLY REFURBISHED 1 Bedroom Ground Floor Flat located walking distance to Harrow town centre.
Accommodation compromises of 1 Double Bedroom, Living Room, New Fitted Kitchen with appliances.
Benefits include Gas Central Heating, Laminteed flooring throughout, Brand new fully tilled bathroom suite.
Local Area:The flat is located within in walking distance to West Harrow and Harrow on The Hill Station, well connected to Central London via the metropolitan Line and National Rail station and vast indoor and outdoor shopping outlets, and many more highly rated cafe's, restaurants and food chains.
Also within close proximity to a variety of convenient Super stores, perfect for all your everyday shopping including Tesco's, Morrison's Iceland and post office.
**


LLM case 

In [27]:
text_test = data.iloc[17809]["description"]
text_test

'Spacious 3 bedroom first floor flat * 2 double bedrooms, 1 single bedroom* bright and airy separate reception area* partly tilled bathroom* Property has access to a lot of storage space* direct access to fitted kitchen from the reception* Access to off road parkingFURNISHED/UNFURNISHED ~ AVAILABLE 3 OCTOBER'

In [28]:
print("\n".join(simple_text_clean(text_test)))

Spacious 3 bedroom first floor flat * 2 double bedrooms, 1 single bedroom* bright and airy separate reception area* partly tilled bathroom* Property has access to a lot of storage space* direct access to fitted kitchen from the reception* Access to off road parkingFURNISHED/UNFURNISHED ~ AVAILABLE 3 OCTOBER


In [29]:
text_test = data.iloc[3567]["description"]
text_test

'With a private entrance foyer, lift access to all floors, and a daytime porter, Trentham Court offers a sophisticated, secure, and convenient living experience.  This recently refurbished and spacious one-bedroom apartment is superbly located just moments from North Acton Station, providing excellent Central Line connections to Central London. The development features well-maintained communal gardens and enjoys South East-facing views, while local amenities, including grocery stores, restaurants, and takeaways, are conveniently situated just across the road.  Finished to the highest standards, the apartment includes a fully integrated kitchen with a fridge freezer, electric hob, cooker hood, and microwave, offering everything you need for modern living. For added convenience, the property is also equipped with a washer/dryer.  Early viewings are highly recommended—contact one of our Property Consultants today to arrange a visit.'

In [30]:
print("\n".join(simple_text_clean(text_test)))

With a private entrance foyer, lift access to all floors, and a daytime porter, Trentham Court offers a sophisticated, secure, and convenient living experience.
This recently refurbished and spacious one-bedroom apartment is superbly located just moments from North Acton Station, providing excellent Central Line connections to Central London.
The development features well-maintained communal gardens and enjoys South East-facing views, while local amenities, including grocery stores, restaurants, and takeaways, are conveniently situated just across the road.
Finished to the highest standards, the apartment includes a fully integrated kitchen with a fridge freezer, electric hob, cooker hood, and microwave, offering everything you need for modern living.
For added convenience, the property is also equipped with a washer/dryer.


In [31]:
text_test = data.iloc[87]["description"]
text_test

'A Georgian semi-detatched house to rent on Braywood Road, SE9. This house is with plenty of features including off-street parking, private garden, side access & More. Book in to view now!This stylish property is with two separate reception rooms on the ground floor giving great space for relaxing and intimate dining or an alternative reception room. The kitchen is separate from both rooms ideal for avid cookers with space for cooking and more in this Galley kitchen looking out to the private garden.The upper floor of the house is comprised of three bedrooms, two of which are sizeable doubles and another additional bedroom.Fitted with a family bathroom with bath tub and overhead shower. 0.1m to Falconwood station and close to Eltham High Street! Unfurnished and available now, this is not to be missed! To check broadband and mobile phone coverage please visit Ofcom here ofcom.org.uk/phones-telecoms-and-internet/advice-for-consumers/advice/ofcom-checker'

In [32]:
print("\n".join(simple_text_clean(text_test)))

A Georgian semi-detatched house to rent on Braywood Road, SE9.
This house is with plenty of features including off-street parking, private garden, side access & More.
Book in to view now!This stylish property is with two separate reception rooms on the ground floor giving great space for relaxing and intimate dining or an alternative reception room.
The kitchen is separate from both rooms ideal for avid cookers with space for cooking and more in this Galley kitchen looking out to the private garden.The upper floor of the house is comprised of three bedrooms, two of which are sizeable doubles and another additional bedroom.Fitted with a family bathroom with bath tub and overhead shower.
0.1m to Falconwood station and close to Eltham High Street!


In [33]:
text_test = data.iloc[6667]["description"]
text_test

"Are you looking for a stunning two bedroom apartment in the vibrant area of Stratford, E20? Look no further! This contemporary property is now available for rent and offers a range of attractive features.Situated on Montfichet Road, this apartment benefits from a convenient location that is surrounded by a multitude of amenities. With Westfield just around the corner, you'll have easy access to an abundance of shops, restaurants, and entertainment options right on your doorstep. Whether you're in the mood for a day of shopping or a night out with friends, everything you need is just a stone's throw away.As you step into the apartment, you'll immediately notice the plethora of natural light flooding through the large windows, creating a bright and airy atmosphere throughout the space. This beautiful feature enhances the modern finishing and complements the contemporary design of the property.The generous open-plan living area provides a perfect space for relaxation and entertainment, w

In [34]:
print("\n".join(simple_text_clean(text_test)))

Are you looking for a stunning two bedroom apartment in the vibrant area of Stratford, E20?
Look no further!
This contemporary property is now available for rent and offers a range of attractive features.Situated on Montfichet Road, this apartment benefits from a convenient location that is surrounded by a multitude of amenities.
With Westfield just around the corner, you'll have easy access to an abundance of shops, restaurants, and entertainment options right on your doorstep.
Whether you're in the mood for a day of shopping or a night out with friends, everything you need is just a stone's throw away.As you step into the apartment, you'll immediately notice the plethora of natural light flooding through the large windows, creating a bright and airy atmosphere throughout the space.
This beautiful feature enhances the modern finishing and complements the contemporary design of the property.The generous open-plan living area provides a perfect space for relaxation and entertainment, wi

In [35]:
# Run the sentence-level cleaner over every description (one list of sentences per row).
data["description"].apply(simple_text_clean)

0        [Stylish two bedroom two bathroom apartment wi...
1        [Ambassador are pleased to offer this recently...
2        [The flat has an open-plan kitchen/reception r...
3        [** Request Details form responded to 24/7, wi...
4        [Lexadon are excited to bring a high spec one ...
                               ...                        
28503    [A wonderful 2 bedroom flat boasting modern ac...
28504    [Stirling Ackroyd are proud to present this go...
28505    [Situated on the 8th floor of Lockgate Lodge, ...
28506    [This smart one bedroom ground floor flat offe...
28507    [A rare opportunity to be the tenant of this e...
Name: description, Length: 28508, dtype: object