### 1. read JSON file(test data) and select the reviews from one hotel and convert it to a dataframe


In [32]:
import pandas as pd

test_data_path = 'scraped_data_5.json' # test data json file path

# String to make the review text invalid
# Positive_Review: invalid reviews are with the following strings:
test_pos_invalid_content = 'there are no comments available for this review|everything'
# Negative_Review: invalid reviews are with the following strings:
test_neg_invalid_content = 'nothing|n/a|none'

# Read JSON file into a DataFrame
with open(test_data_path, 'r') as f:
    df = pd.read_json(f)


In [33]:

# a function to get all reviews from one hotel
def get_hotel_reviews(df, hotel_name):
    # Get the row of the DataFrame where the 'business_name' column is equal to the hotel_name
    hotel_df = df[df['business_name'] == hotel_name]

    # Get the review list from that hotel: reviews column first row
    reviews = hotel_df['reviews'].iloc[0]

    # Merge all rows of the 'reviews' column into one list
    reviews_list = [review for review in reviews]

    # Create a DataFrame from the list of dictionaries
    review_df = pd.DataFrame(reviews_list)

    #only keep 3 columns we needed, and rename the columns 'Negative_Review', 'Positive_Review', 'Review_Date',
    reviews_df_clean = review_df[['review_date', 'review_liked', 'review_disliked']]
    reviews_df_clean = reviews_df_clean.rename(columns={'review_date': 'Review_Date', 'review_liked': 'Positive_Review', 'review_disliked': 'Negative_Review'})

    # Add the 'Hotel_Name' column to the DataFrame
    reviews_df_clean['Hotel_Name'] = hotel_name

    # Move the 'Hotel_Name' column to the front of the DataFrame
    hotel_name_col = reviews_df_clean.pop('Hotel_Name')
    reviews_df_clean.insert(0, 'Hotel_Name', hotel_name_col)

    print(f"Created DataFrame with {len(reviews_df_clean)} rows.")
    # Return the DataFrame of reviews
    return reviews_df_clean


In [34]:
### Test the function ###
# Get the reviews for the 'lyf Farrer Park Singapore' hotel
review_df = get_hotel_reviews(df, 'lyf Farrer Park Singapore')
review_df.head()

Created DataFrame with 1623 rows.


Unnamed: 0,Hotel_Name,Review_Date,Positive_Review,Negative_Review
0,lyf Farrer Park Singapore,"May 26, 2023","The ease of the stay, from check in to check o...",Having to leave 😭
1,lyf Farrer Park Singapore,"June 24, 2023","Great location, supermarket in front and a lot...","The neighbors were to noisy at midnight, screa..."
2,lyf Farrer Park Singapore,"June 22, 2023",Location and gym.,"Noisy at night. Some students stayed over, but..."
3,lyf Farrer Park Singapore,"June 22, 2023","Room is big, great view.",The bathroom layout gives no privacy at all. N...
4,lyf Farrer Park Singapore,"June 22, 2023",Location is excellent. Well withing the reach ...,I think the frequency and quality of cleaning ...


In [53]:
review_df['Negative_Review'].iloc[3]

'The bathroom layout gives no privacy at all. No place to hang clothes when showering. Not a good room if you are sharing with another but I am staying alone not a problem.'

In [35]:
# define a function to loop through the hotel to get the reviews for a df from hotel json file 
def get_all_hotel_reviews(df):
    # get all the business names(hotel name) to a list
    business_names = df['business_name'].tolist()

    result_df= pd.DataFrame()
    for hotel_name in business_names:

        #get the reviews for the hotel
        review_df = get_hotel_reviews(df, hotel_name)

        # concatenate the two dataframes along the rows
        result_df = pd.concat([result_df, review_df])
        print(result_df.tail(2))
    
    return result_df

  


In [None]:
### test the function ###
result_df = get_all_hotel_reviews(df)  
result_df


### 2. clean the review text for testing data

In [60]:
from langdetect import detect

def detect_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False


In [74]:
detect_english('Room is big, great view.')

False

In [58]:
!pip install -U textblob

Collecting textblob
  Downloading textblob-0.17.1-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.8/636.8 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: textblob
Successfully installed textblob-0.17.1


In [59]:
# Import the TextBlob class from the textblob library
from textblob import TextBlob

# Define the input text
text = "это компьютерный портал для гиков. It was a beautiful day ."

# Create a TextBlob object from the input text
lang = TextBlob(text)

# Detect the language of the text using the detect_language method
language = lang.detect_language()

# Print the detected language
print(language)

HTTPError: HTTP Error 400: Bad Request

In [70]:
import numpy as np
from langdetect import detect


# define a function to check if a string is in English
def is_english(text):
    if not text: return True # an empty string is considered English
    try:
        lang = detect(text)
        return lang == 'en'
    except:
        return False

# define a function to do basic clean of the review text
def clean_text(review_df, column_name, invalid_content_str):
    # create the docstring for this function
    """
    # Input:
    # the review_df with 4 columns: 'Negative_Review', 'Positive_Review', 'Review_Date', 'Hotel_Name'
    # column_name: 'Negative_Review' or 'Positive_Review'
    # 
    # Output: 
    # a DataFrame with the same columns as review_df, but with invalid reviews removed
    # """

    # filter out non-English reviews from the column
    english_reviews = review_df[review_df[column_name].apply(is_english)]
    english_reviews

    # convert to lowercase
    english_reviews[column_name] = english_reviews[column_name].str.lower()

    # apply strip() to remove leading and trailing whitespaces
    english_reviews[column_name] = english_reviews[column_name].apply(lambda x: x.strip())

    # if the text contain invalid_content_str, replace the text with NAN
    english_reviews.loc[english_reviews[column_name].str.contains(invalid_content_str, case=False), column_name] = np.nan

    # if the text is empty, replace the text with NAN
    english_reviews.loc[english_reviews[column_name] == '', column_name] = np.nan  

    return english_reviews

# define a function to clean the review text from one hotel dateframe(testing data)
def clean_hotel_reviews(review_df, test_pos_invalid_content, test_neg_invalid_content):
    # clean the positive reviews
    print(review_df['Negative_Review'].iloc[3])
    cleaned_pos_df = clean_text(review_df, 'Positive_Review', test_pos_invalid_content)
    print(cleaned_pos_df['Negative_Review'].iloc[3])
    # clean the negative reviews
    cleaned_neg_df = clean_text(cleaned_pos_df, 'Negative_Review', test_neg_invalid_content)
    print(cleaned_neg_df['Negative_Review'].iloc[3])
    return cleaned_neg_df

    



In [71]:
### test the function ###
cleaned_review_df = clean_hotel_reviews(review_df.head(6), test_pos_invalid_content, test_neg_invalid_content)
cleaned_review_df

The bathroom layout gives no privacy at all. No place to hang clothes when showering. Not a good room if you are sharing with another but I am staying alone not a problem.
I think the frequency and quality of cleaning could be better.
i think the frequency and quality of cleaning could be better.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_reviews[column_name] = english_reviews[column_name].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_reviews[column_name] = english_reviews[column_name].apply(lambda x: x.strip())


Unnamed: 0,Hotel_Name,Review_Date,Positive_Review,Negative_Review
0,lyf Farrer Park Singapore,"May 26, 2023","the ease of the stay, from check in to check o...",having to leave 😭
1,lyf Farrer Park Singapore,"June 24, 2023","great location, supermarket in front and a lot...","the neighbors were to noisy at midnight, screa..."
2,lyf Farrer Park Singapore,"June 22, 2023",location and gym.,"noisy at night. some students stayed over, but..."
4,lyf Farrer Park Singapore,"June 22, 2023",location is excellent. well withing the reach ...,i think the frequency and quality of cleaning ...


In [72]:
review_df['Positive_Review'].iloc[3]


'Room is big, great view.'

In [40]:
cleaned_review_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3014 entries, 1 to 1622
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Hotel_Name       3014 non-null   object
 1   Review_Date      3014 non-null   object
 2   Positive_Review  1063 non-null   object
 3   Negative_Review  750 non-null    object
dtypes: object(4)
memory usage: 117.7+ KB


### 4. Save the df to pickle file

In [41]:
import pickle
# define a function to save the cleaned dataframe to a pickle file
def save_to_pickle(df, file_name):    
    with open(f'{file_name}.pkl', 'wb') as file:
        pickle.dump(df, file)


    

In [42]:
#save the cleaned reviews to a pickle file -'cleaned_test_data_5.pkl'
save_to_pickle(result_df, 'cleaned_test_data_5')

In [43]:
# open the pickle file for reading
with open('cleaned_test_data_5.pkl', 'rb') as file:
    # load the data from the file
    cleaned_test_data = pickle.load(file)

# print the loaded data
cleaned_test_data

Unnamed: 0,Hotel_Name,Review_Date,Positive_Review,Negative_Review
0,"Holiday Inn Express Singapore Katong, an IHG H...","June 17, 2023","Customer Service, Accommodation, facilities ...",Nil
1,"Holiday Inn Express Singapore Katong, an IHG H...","June 24, 2023","The room was cozy, the staff were friendly and...","The day curtain could not close completely, me..."
2,"Holiday Inn Express Singapore Katong, an IHG H...","June 24, 2023",The hotel was comfortable with quality beds an...,Nothing
3,"Holiday Inn Express Singapore Katong, an IHG H...","June 24, 2023",Just awesome. \nI was in trouble to printed ou...,
4,"Holiday Inn Express Singapore Katong, an IHG H...","June 24, 2023","The breakfast was OK. However, it was botherso...",
...,...,...,...,...
1618,lyf Farrer Park Singapore,"March 22, 2023",There are no comments available for this review,
1619,lyf Farrer Park Singapore,"March 6, 2023",There are no comments available for this review,
1620,lyf Farrer Park Singapore,"January 26, 2023",There are no comments available for this review,
1621,lyf Farrer Park Singapore,"December 20, 2022",There are no comments available for this review,
