In [52]:
import pandas as pd
import numpy as np

In [100]:
reviews = pd.read_csv("http://data.insideairbnb.com/canada/on/toronto/2024-01-08/data/reviews.csv.gz")

In [101]:
# Remove missing values

reviews.dropna(inplace=True)

In [102]:
import re
from collections import defaultdict

# Function to find HTML tags for inspection
def find_html_tags(text_series):
    tag_counts = defaultdict(int)
    pattern = re.compile(r'<[^>]+>')
    for text in text_series.dropna():
        found_tags = pattern.findall(text)
        for tag in found_tags:
            tag_counts[tag] += 1
    return sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)

# Inspect HTML tags
html_tags = find_html_tags(reviews['comments'])
print("HTML tags found:", html_tags)


HTML tags found: [('<br/>', 174625), ('<3<br/>', 10), ('<3.<br/>', 2), ('<3 <br/>', 2), ('<5 min walk away, with Metro a little further down) and lots of coffee shops in the area. <br/>', 1), ('<5min stroll)<br/>', 1), ('<20mins to Services Canada centre which is the place for getting social insurance number.<br/>', 1), ('<< Premier arrivé premier servi >', 1), ('<3, tourist information, street and subway maps, etc.. Matching sets of good quality cutlery, utensils, dishes, glasses, pots and pans etc..<br/>', 1), ('<5 mins walk to the subway,<br/>', 1), ('<15 mins walk to a library or the laundry,<br/>', 1), ("<1 mile) there are so many fun things to see, do, and eat on the streets surrounding the apartment. We purchased a public parking pass and had no issues finding street parking close to Maria's place.<br/>", 1), ('<3 Everything was so tidy and clean. Also, the dog in her home is really friendly and cute :) I will definitely go her place again next time!<br/>', 1), ('<10 minutes wal

In [103]:
# It seems our code above is catching instances that are not html tags, so lets edit our code to catch only html tags

import re
from collections import defaultdict

# Function to find HTML tags for inspection
def find_html_tags(text_series):
    tag_counts = defaultdict(int)
    # Adjusted pattern to exclude tags that do not start with a letter or a '/'
    pattern = re.compile(r'<\/?[a-z]+[^>]*>', re.IGNORECASE)
    for text in text_series.dropna():
        found_tags = pattern.findall(text)
        for tag in found_tags:
            tag_counts[tag] += 1
    return sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)

# Inspect HTML tags
html_tags = find_html_tags(reviews['comments'])
for tag, count in html_tags:
    print(f"{tag}: {count}")


<br/>: 174670
<must see>: 1
<must do>: 1
<Toronto Transit Commission>: 1
<Union Pearson Express>: 1


In [104]:
## Function to remove <br/> tags
def remove_br_tags(text):
    # Check if text is not NaN
    if pd.notna(text):
        return re.sub(r'<br\s*/?>', ' ', text)  # Replace <br/> with space
    return text

# Apply the function to each comment to remove <br/> tags
reviews['cleaned_comments'] = reviews['comments'].apply(remove_br_tags)

In [105]:
# Lets check to see they we're removed correctly

# Function to find HTML tags for inspection
def find_html_tags(text_series):
    tag_counts = defaultdict(int)
    pattern = re.compile(r'<[^>]+>')
    for text in text_series.dropna():
        found_tags = pattern.findall(text)
        for tag in found_tags:
            tag_counts[tag] += 1
    return sorted(tag_counts.items(), key=lambda x: x[1], reverse=True)

# Inspect HTML tags
html_tags = find_html_tags(reviews['cleaned_comments'])
print("HTML tags found:", html_tags)

HTML tags found: [('<< Premier arrivé premier servi >', 1), ('<10 minutes walk distance to nearest TTC Subway Station (and 1-2 minute walk to nearest Bus Station) - ard. 15 minutes to downtown and <10 minutes to north york via Subway. There is supermarkets within 10 minutes of walk and pharmacy within 15 minutes. The room is quite spacious and you will have all you need in this basement apartment. If you prefer cooking (like me), there are good range of utensils and appliances in the kitchen. The host has also decorated the rooms with plentiful of paintings, which makes it quite soothing. The host is very caring and responsive. She is living upstairs separately but respects privacy a lot. So if you prefer having more personal time, this is the place for you. One thing to note for very tall people - like most of the basement apartment, the ceiling of this listing is not very high. But generally it is good for average people (i am >', 1), ('<<must see>', 1), ('<<must do>', 1), ('<Toronto

In [106]:
# Now let's look for URLS

def find_urls(text):
    # This regular expression pattern looks for URL formats
    # including those that start with 'www.', 'http://', 'https://', and also considers some common top-level domains.
    url_pattern = r'(https?://[^\s]+)|(www\.[^\s]+)|(http?://[^\s]+)'
    urls = re.findall(url_pattern, text)
    # URLs could be in a tuple if multiple patterns match, we flatten and filter empty strings
    urls = [url for group in urls for url in group if url]
    return urls

# Apply the function to each review and display the comments with URLs
comments_with_urls = [(review, find_urls(review)) for review in reviews['cleaned_comments'] if find_urls(review)]
for comment, urls in comments_with_urls:
    print(f"Comment: {comment}\nURLs: {', '.join(urls)}\n")

 If you drive, make sure you purchase a parking permit (available online here: https://secure.toronto.ca/wes/eTPP/welcome.do)
URLs: https://secure.toronto.ca/wes/eTPP/welcome.do)

 This room was on top floor and bathroom/shower is on basement, which means 3 flights of stairs to brave for any such need. Not Amy or Grame's fault by any mean, but we didn't pay attention on booking of that detail and with a recovering sprained ankle - we should have... Be warned but overall that's the only minus and no deal breaker.
URLs: https://www.airbnb.ca/rooms/1510402/guidebook

 If you are lucky enough to stay with Caroline, you will never go anywhere else. A truly unique and special Toronto experience. olished silver flatware. We felt like we were in a 5 star hotel.romantic place in time.
URLs: http://www.mariakillam.com/colourhallways/

Comment: https://www.airbnb.com/reviews/11576172/edit#  This 2 night stay was perfect  for me.  Gg and Archer accepted my request with very little notice and were 

In [107]:
# Remove URLS

def remove_urls(text):
    # This pattern matches most types of URLs
    url_pattern = r'https?://[^\s]+|www\.[^\s]+|http?://[^\s]+'
    # Replace found URLs with an empty string
    return re.sub(url_pattern, '', text)

# Apply the function to remove URLs from each comment
reviews['cleaned_comments'] = reviews['cleaned_comments'].apply(remove_urls)

In [108]:
# Lets check to make sure they were removed correctly

def find_urls(text):
    # This regular expression pattern looks for URL formats
    # including those that start with 'www.', 'http://', 'https://', and also considers some common top-level domains.
    url_pattern = r'(https?://[^\s]+)|(www\.[^\s]+)|(http?://[^\s]+)'
    urls = re.findall(url_pattern, text)
    # URLs could be in a tuple if multiple patterns match, we flatten and filter empty strings
    urls = [url for group in urls for url in group if url]
    return urls

# Apply the function to each review and display the comments with URLs
comments_with_urls = [(review, find_urls(review)) for review in reviews['cleaned_comments'] if find_urls(review)]
for comment, urls in comments_with_urls:
    print(f"Comment: {comment}\nURLs: {', '.join(urls)}\n")

In [109]:
# Simply viewing instances where abnormal whitespace might be present
sample_reviews = reviews['cleaned_comments'].sample(100).apply(lambda x: re.sub(r'\s+', ' ', str(x)))
print(sample_reviews)

382539    Perfect location, closed parking and very new ...
230044    Sana's place is great for anyone who is lookin...
449587    Good location with parking in Toronto - a plus...
337264    Comfortable and clean, walking distance from a...
387485    Good location, easy to drive to downtown Toron...
                                ...                        
69859     My friend and I were visiting from Vancouver, ...
230393    cute &clean place.big enough for a short stay ...
57066     Sylvie and Bill’s place is perfectly located n...
72442     It was a good apartment, in a nice district. W...
690       I really enjoyed my stay. Basil's home was com...
Name: cleaned_comments, Length: 100, dtype: object


In [110]:
# lets remove those additonal whitespaces now
# This creates a new column in the 'reviews' DataFrame where all comments are cleaned
reviews['cleaned_comments'] = reviews['cleaned_comments'].apply(lambda x: re.sub(r'\s+', ' ', str(x)).strip())

In [111]:
# Now lets check for language
from langdetect import detect, LangDetectException

def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        return 'error'

# Apply the language Detection function to each review - add in new language column
reviews['language'] = reviews['cleaned_comments'].apply(lambda x: detect_language(str(x)))

In [112]:
# Check the distribution of languages
language_distribution = reviews['language'].value_counts()
print(language_distribution)

language
en       460087
fr        17512
es         9962
de         3218
pt         2665
zh-cn      2664
ro         2322
ko         1785
error      1384
so         1338
it          938
af          923
ca          790
nl          717
ja          525
tl          509
cs          490
pl          382
ru          321
zh-tw       249
sw          233
hr          222
no          207
cy          185
id          173
tr          172
sv          161
da          161
hu          144
vi          114
fi           90
uk           78
et           74
sl           72
he           57
sk           53
ar           15
lv           14
el           13
lt           12
sq            9
bg            7
mk            5
th            4
fa            2
hi            1
ur            1
gu            1
Name: count, dtype: int64


In [113]:
# Remove all rows where the language is not English
reviews = reviews[reviews['language'] == 'en']

In [114]:
# Lets Check the distribution of languages again
# Apply the language Detection function to each review - add in new language column
reviews['language'] = reviews['cleaned_comments'].apply(lambda x: detect_language(str(x)))
language_distribution = reviews['language'].value_counts()
print(language_distribution)

language
en    459344
fr       146
ro       142
af        54
hr        43
nl        37
it        36
cs        35
so        31
ca        29
tl        26
es        23
cy        20
pl        15
no        15
da        14
fi        13
pt        12
de        11
hu        11
sw         7
tr         7
sv         5
id         5
et         4
ko         1
sl         1
Name: count, dtype: int64


In [119]:
# Filter out English and 'error' language entries
non_english_reviews = reviews[(reviews['language'] != 'en')]
print(non_english_reviews)

Empty DataFrame
Columns: [listing_id, id, date, reviewer_id, reviewer_name, comments, cleaned_comments, language]
Index: []


In [120]:
# Remove all rows where the language is not English
reviews = reviews[reviews['language'] == 'en']

In [121]:
# Let's correct the date type so it's correct

reviews['date']=pd.to_datetime(reviews['date'])

In [122]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 459344 entries, 0 to 511138
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   listing_id        459344 non-null  int64         
 1   id                459344 non-null  int64         
 2   date              459344 non-null  datetime64[ns]
 3   reviewer_id       459344 non-null  int64         
 4   reviewer_name     459344 non-null  object        
 5   comments          459344 non-null  object        
 6   cleaned_comments  459344 non-null  object        
 7   language          459344 non-null  object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 31.5+ MB


In [123]:
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,cleaned_comments,language
0,1419,38924112,2015-07-19,11308465,Marcela,Having the opportunity of arriving to Alexandr...,Having the opportunity of arriving to Alexandr...,en
1,1419,44791978,2015-08-29,9580285,Marco,We have no enough words to describe how beauty...,We have no enough words to describe how beauty...,en
2,1419,45957133,2015-09-07,38394721,Andrea,The listing was exceptional and an even better...,The listing was exceptional and an even better...,en
3,1419,67295154,2016-03-28,3515044,Shaun,Alexandra's home was amazing and in such a nea...,Alexandra's home was amazing and in such a nea...,en
4,1419,177702208,2017-08-03,13987100,Kate,Beautiful home. Very comfortable and clean. Pe...,Beautiful home. Very comfortable and clean. Pe...,en


In [124]:
reviews = reviews.reset_index(drop=True)

In [43]:
import string

# Function to identify special characters
def find_special_characters(text_series):
    special_chars = defaultdict(int)
    for text in text_series.dropna():
        chars = [char for char in text if char in string.punctuation]
        for char in chars:
            special_chars[char] += 1
    return sorted(special_chars.items(), key=lambda x: x[1], reverse=True)

# Inspect special characters
special_chars = find_special_characters(reviews['comments'])
print("Special characters found:", special_chars)


Special characters found: [('.', 1407322), (',', 718519), ('!', 365904), ("'", 150007), ('-', 100646), (')', 61751), ('(', 44255), ('/', 31051), (':', 28914), ('&', 19837), (';', 9403), ('"', 6226), ('%', 5309), ('+', 3816), ('$', 2851), ('?', 2156), ('*', 1718), ('~', 867), ('^', 343), ('#', 307), ('<', 242), ('=', 234), ('_', 229), ('`', 146), ('>', 142), ('@', 140), (']', 91), ('[', 78), ('\\', 36), ('}', 8), ('{', 7), ('|', 4)]


In [44]:
special_chars = [
    '.', ',', '!', '/', '<', '>', "'", '-', ')', '(', ':', '&', ';', '"', 
    '%', '+', '$', '?', '*', '~', '^', '#', '=', '_', '`', '@', ']', '[', 
    '\\', '}', '{', '|'
]

# Function to display sample reviews containing specific special characters
def display_reviews_with_special_chars(text_series, chars, num_samples=5):
    for char in chars:
        print(f"Reviews containing '{char}':")
        # Find reviews containing the character
        containing_char = text_series[text_series.str.contains(re.escape(char), na=False)]
        # Display a sample of reviews
        for review in containing_char.sample(min(num_samples, len(containing_char))):
            print(f" - {review}\n")
        print("----------------------------------------------------\n")

# Example usage
display_reviews_with_special_chars(reviews['comments'], special_chars)

Reviews containing '.':
 - Great value and location. Very close downtown and lots of options for restaurants. Spacious and clean. We had a great stay!

 - Un grand merci à Damaris et son mari, leur logement est très agréable et parfaitement bien pensé à tous les points de vue. Nous sommes un couple dans la soixantaine et étions venus pour visiter, pendant une semaine, une amie travaillant à l’Alliance Française, au centre ville de Toronto. Situation du logement : dans l’entresol de la maison, comporte trois petites fenêtres, donc cela manque un peu de luminosité naturelle mais de toutes manières nous y avons principalement passé la nuit. Néanmoins, le logement manque nullement de lumière, les luminaires ont été installés de façon intelligente et sont réglables. Il n’y a pas vraiment d’isolation phonique avec les étages supérieurs mais toute la famille s’est montrée très respectueuse, ce qui, au vu de cette situation, est également attendu des visiteurs ! La cohabitation s’est donc fait

Lets save the cleaned dataset for use in the next steps

In [99]:
# Lets save the datset to be used for next steps - save as pickle to preserve datatype changes (date)
reviews.to_pickle('cleaned_reviews_w_lang.csv')