### Import Data

In [39]:
import pandas as pd

NYC = pd.read_csv('NYC_reviews.csv')
HKC = pd.read_csv('HKC_reviews.csv')
SGD = pd.read_csv('SGD_reviews.csv')

In [15]:
NYC.head()


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r<br/>Nous avons ...
1,2595,19176,2009-12-05,53267,Cate,Great experience.
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...
3,2595,34320,2010-04-09,71130,Kai-Uwe,"We've been staying here for about 9 nights, en..."
4,2595,46312,2010-05-25,117113,Alicia,We had a wonderful stay at Jennifer's charming...


In [16]:
NYC.shape

(983621, 6)

In [44]:
SGD['date'] = pd.to_datetime(SGD['date']).dt.date
NYC['date'] = pd.to_datetime(HKC['date']).dt.date
HKC['date'] = pd.to_datetime(NYC['date']).dt.date

In [45]:
NYC.dtypes

listing_id        int64
id                int64
date             object
reviewer_id       int64
reviewer_name    object
comments         object
dtype: object

In [19]:
HKC.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,17891,37352,2010-04-23,76132,Tamara,The apartment on Holly wood Rd was exactly as ...
1,17891,56688,2010-06-24,97136,Wendy,"Apartment was perfect - stylish, spacious and..."
2,17891,76243,2010-08-09,163312,Angela,We loved staying here for the weekend. The apa...
3,17891,93590,2010-09-09,148333,Jennifer,An awesome loft that looks just like it does i...
4,17891,108082,2010-09-29,218702,Matt,"Candace was great, she prepared the loft with ..."


In [20]:
HKC.shape

(95386, 6)

In [21]:
SGD.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,71609,793880,2011-12-19,1456140,Max,The rooms were clean and tidy. Beds very comfo...
1,3209752,523972161,2019-09-05,20987832,Eng Hui,"Thank you for making me feel at home, it is su..."
2,3209752,527054740,2019-09-10,3963956,Lug,"An absolutely amazing place. It’s clean, styli..."
3,3209752,528926785,2019-09-14,66599486,Andrew,This location is private yet it is only 400m w...
4,3209752,538619865,2019-09-30,110050140,Damien,Jeremy is a great host that will make your sta...


In [22]:
SGD.shape

(35698, 6)

### Combinne the Data

In [49]:
import pandas as pd

SGD['Country'] = 'Singapore'
NYC['Country'] = 'New York'
HKC['Country'] = 'Hong Kong'

combined_df = pd.concat([SGD, NYC, HKC], ignore_index=True)

combined_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,Country
0,71609,793880,2011-12-19,1456140,Max,The rooms were clean and tidy. Beds very comfo...,Singapore
1,3209752,523972161,2019-09-05,20987832,Eng Hui,"Thank you for making me feel at home, it is su...",Singapore
2,3209752,527054740,2019-09-10,3963956,Lug,"An absolutely amazing place. It’s clean, styli...",Singapore
3,3209752,528926785,2019-09-14,66599486,Andrew,This location is private yet it is only 400m w...,Singapore
4,3209752,538619865,2019-09-30,110050140,Damien,Jeremy is a great host that will make your sta...,Singapore


In [50]:
combined_df.shape

(1114705, 7)

In [51]:
combined_df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,Country
0,71609,793880,2011-12-19,1456140,Max,The rooms were clean and tidy. Beds very comfo...,Singapore
1,3209752,523972161,2019-09-05,20987832,Eng Hui,"Thank you for making me feel at home, it is su...",Singapore
2,3209752,527054740,2019-09-10,3963956,Lug,"An absolutely amazing place. It’s clean, styli...",Singapore
3,3209752,528926785,2019-09-14,66599486,Andrew,This location is private yet it is only 400m w...,Singapore
4,3209752,538619865,2019-09-30,110050140,Damien,Jeremy is a great host that will make your sta...,Singapore


### Check for Null Value and Inconsistency

In [53]:
duplicates = combined_df.duplicated(subset='id', keep=False)

has_duplicates = duplicates.any()

print(has_duplicates)


False


In [35]:
unique_counts = combined_df.groupby('reviewer_id')['reviewer_name'].nunique()
a_determines_b = (unique_counts == 1).all()
print(a_determines_b)  # Output: True if 'reviewer_id' determines 'reviewer_name', False otherwise


True


In [36]:
# Find the reviewer_ids with more than one associated reviewer_name
multiple_names_ids = combined_df.groupby('reviewer_id')['reviewer_name'].nunique() > 1

# Filter out the ids with multiple names
multiple_names_ids = multiple_names_ids[multiple_names_ids]

# Now, let's see the data for these ids
discrepancies = combined_df[combined_df['reviewer_id'].isin(multiple_names_ids.index)]

# Sort the data by reviewer_id for easier inspection
discrepancies = discrepancies.sort_values(by='reviewer_id')

# Display the data
print(discrepancies)


Empty DataFrame
Columns: [listing_id, id, date, reviewer_id, reviewer_name, comments, Country]
Index: []


In [37]:
# Group by 'reviewer_id' and count unique 'reviewer_name' values
unique_counts = combined_df.groupby('reviewer_id')['reviewer_name'].nunique()

# Find 'reviewer_id' values with more than one unique 'reviewer_name'
multiple_names_ids = unique_counts[unique_counts > 1]

# Display the 'reviewer_id' values and the count of unique 'reviewer_name' values
print(multiple_names_ids)


Series([], Name: reviewer_name, dtype: int64)


In [38]:
# Check for null values in the 'reviewer_id' and 'reviewer_name' columns
null_counts = combined_df[['reviewer_id', 'reviewer_name']].isnull().sum()

# Output the count of null values in each column
print(null_counts)


reviewer_id      0
reviewer_name    0
dtype: int64


### Format the "comments" Column

In [65]:
# Convert the 'column_name' to string type
combined_df['comments'] = combined_df['comments'].astype(str)


In [71]:
type(combined_df['comments'][100])

str

In [72]:
import pandas as pd
import re

# Assuming df is your DataFrame and 'comments' is the column with the HTML content
combined_df['comments'] = combined_df['comments'].replace(to_replace=r'<br\s*\/?>', value='\n', regex=True)


In [73]:
combined_df['comments']

0          The rooms were clean and tidy. Beds very comfo...
1          Thank you for making me feel at home, it is su...
2          An absolutely amazing place. It’s clean, styli...
3          This location is private yet it is only 400m w...
4          Jeremy is a great host that will make your sta...
                                 ...                        
1114700    先說總體感受，實在是太棒了！無論是房子還是房東 Coco，都是完美的！\n房子位置在油麻地，...
1114701    This apartment is only 3 mins walk from the Ya...
1114702    Awesome stay with Forrest. What a great guy. P...
1114703    卫生：很干净\n实用小贴士：楼下有按摩店累了可以去按摩放松一下\n沟通：房东很友好，阳光帅气...
1114704                                   房主人很好，交代清晰及順利，推薦👍🏻
Name: comments, Length: 1114705, dtype: object

### Remove the "Country" Column to avoid repetitive information

In [93]:
combined_df = combined_df.drop(columns=['Country'])

In [94]:
combined_df.head()

Unnamed: 0,listing_id,rid,date,reviewer_id,reviewer_name,comments
0,71609,793880,2011-12-19,1456140,Max,The rooms were clean and tidy. Beds very comfo...
1,3209752,523972161,2019-09-05,20987832,Eng Hui,"Thank you for making me feel at home, it is su..."
2,3209752,527054740,2019-09-10,3963956,Lug,"An absolutely amazing place. It’s clean, styli..."
3,3209752,528926785,2019-09-14,66599486,Andrew,This location is private yet it is only 400m w...
4,3209752,538619865,2019-09-30,110050140,Damien,Jeremy is a great host that will make your sta...


### Format the "date" Column

In [102]:
import pandas as pd

# Assuming df is your DataFrame

# Convert 'date' column to datetime
combined_df['date'] = pd.to_datetime(combined_df['date'])

# Filter out the DataFrame for reviews after 2019
combined_df = combined_df[combined_df['date'].dt.year > 2019]
combined_df

Unnamed: 0,listing_id,rid,date,reviewer_id,reviewer_name,comments
12,3209752,611063059,2020-02-26,51293660,Mark,Clean and tidy room in shared apartment in Cha...
13,3209752,613450484,2020-03-02,932827,Joanne,"Classy, stylish, and private - Jeremy's condo ..."
62,71609,593817449,2020-01-17,29197511,Elton,Belinda was really welcoming and the checking ...
274,71903,590602364,2020-01-09,228835457,도현,공항에 늦게 도착해 다른 곳으로 이동하기 전 하루를 묵었습니다. 의사소통도 문제 없...
306,275343,622610014,2020-04-17,215743044,Sophie,"Joyce is a very kind host, who always does her..."
...,...,...,...,...,...,...
1114700,974317387304888150,976436841075046364,2023-09-09,173181908,Dennis,先說總體感受，實在是太棒了！無論是房子還是房東 Coco，都是完美的！\n房子位置在油麻地，...
1114701,974317387304888150,977992826740849029,2023-09-11,206028952,Wallace,This apartment is only 3 mins walk from the Ya...
1114702,968032750772868752,972842079243299183,2023-09-04,527019129,Jamie,Awesome stay with Forrest. What a great guy. P...
1114703,968032750772868752,980124618059896949,2023-09-14,197973683,Kylin,卫生：很干净\n实用小贴士：楼下有按摩店累了可以去按摩放松一下\n沟通：房东很友好，阳光帅气...


### Export the Data

In [103]:
combined_df.to_csv('review_preprocessed.csv', index=False)

In [104]:
listing_id_df = combined_df['listing_id']

In [105]:
listing_id_df = listing_id_df.drop_duplicates()

In [106]:
listing_id_df

12                    3209752
62                      71609
274                     71903
306                    275343
404                    275344
                  ...        
1114697    965065707407457243
1114699    967767809912510839
1114700    974317387304888150
1114702    968032750772868752
1114704    969283944768308420
Name: listing_id, Length: 5007, dtype: int64

In [107]:
csv_file_path = 'listing_id.csv'
listing_id_df.to_csv(csv_file_path, index=False)

In [108]:
combined_df.head()

Unnamed: 0,listing_id,rid,date,reviewer_id,reviewer_name,comments
12,3209752,611063059,2020-02-26,51293660,Mark,Clean and tidy room in shared apartment in Cha...
13,3209752,613450484,2020-03-02,932827,Joanne,"Classy, stylish, and private - Jeremy's condo ..."
62,71609,593817449,2020-01-17,29197511,Elton,Belinda was really welcoming and the checking ...
274,71903,590602364,2020-01-09,228835457,도현,공항에 늦게 도착해 다른 곳으로 이동하기 전 하루를 묵었습니다. 의사소통도 문제 없...
306,275343,622610014,2020-04-17,215743044,Sophie,"Joyce is a very kind host, who always does her..."


In [109]:
reviewer_id_df = combined_df['reviewer_id']

In [110]:
reviewer_id_df = reviewer_id_df.drop_duplicates()

In [111]:
reviewer_id_df

12          51293660
13            932827
62          29197511
274        228835457
306        215743044
             ...    
1114698    202688118
1114699    159441318
1114700    173181908
1114701    206028952
1114704     69937735
Name: reviewer_id, Length: 74202, dtype: int64

In [112]:
csv_file_path = 'reviewer_id.csv'
reviewer_id_df.to_csv(csv_file_path, index=False)

In [114]:
reviewer_id_name_df = combined_df[['reviewer_id','reviewer_name']]
reviewer_id_name_df = reviewer_id_name_df.drop_duplicates()
reviewer_id_name_df

Unnamed: 0,reviewer_id,reviewer_name
12,51293660,Mark
13,932827,Joanne
62,29197511,Elton
274,228835457,도현
306,215743044,Sophie
...,...,...
1114698,202688118,Tito
1114699,159441318,宇
1114700,173181908,Dennis
1114701,206028952,Wallace


In [115]:
csv_file_path = 'reviewer_id_name.csv'
reviewer_id_name_df.to_csv(csv_file_path, index=False)