## ANLP Assignment 2: Text Generation
### Shuying Piao, ID: a1912828

In [31]:
import pandas as pd

In [None]:
import pandas as pd

# Load training and test datasets (lines=True: one JSON object per line)
train_df = pd.read_json('hotel_reviews_train.JSON', lines=True)
test_df = pd.read_json('hotel_reviews_test.JSON', lines=True)

# Rename 'id' to 'review_id' to serve as unique identifier
train_df = train_df.rename(columns={'id': 'review_id'})
test_df = test_df.rename(columns={'id': 'review_id'})


def _expand_ratings(reviews):
    """Expand the dict-valued 'ratings' column into one column per rating key.

    The frame is built once from the list of dicts instead of calling
    ``apply(pd.Series)``, which constructs a separate Series for every row.
    Keys missing from a given review become NaN.

    Assumes every entry of ``reviews['ratings']`` is a dict -- TODO confirm
    for new data dumps.
    """
    return pd.DataFrame(reviews['ratings'].tolist(), index=reviews.index)


def _assemble_processed(ratings, reviews, context_columns):
    """Concatenate rating columns, the target text and the context columns,
    then index the result by 'review_id'."""
    combined = pd.concat([ratings, reviews[['text']], reviews[context_columns]], axis=1)
    return combined.set_index('review_id')


# Expand the 'ratings' field into multiple rating feature columns
train_ratings = _expand_ratings(train_df)
test_ratings = _expand_ratings(test_df)

# Keep all other columns as additional context features (excluding 'title', 'text', 'ratings')
extra_columns = [col for col in train_df.columns if col not in ['text', 'title', 'ratings']]

# extra_columns is derived from the training frame but also used to select
# from the test frame, so fail early with a readable message if they differ.
missing_in_test = [col for col in extra_columns if col not in test_df.columns]
assert not missing_in_test, f"Test set is missing columns present in train: {missing_in_test}"

# Combine rating features, the target text, and the extra context fields
# (which include review_id, used as the index)
train_processed = _assemble_processed(train_ratings, train_df, extra_columns)
test_processed = _assemble_processed(test_ratings, test_df, extra_columns)

# Preview samples from train and test datasets to confirm structure
print("Train sample:")
display(train_processed.head(5))

print("Test sample:")
display(test_processed.head(5))


Train sample:


Unnamed: 0_level_0,service,cleanliness,overall,value,location,sleep_quality,rooms,check_in_front_desk,business_service_(e_g_internet_access),text,author,date_stayed,offering_id,num_helpful_votes,date,via_mobile
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
140716137,1.0,2.0,1.0,1.0,3.0,2.0,1.0,,,First of all we got there and they didn't have...,"{'username': 'Kh3RD', 'num_reviews': 1, 'id': ...",September 2012,80138,0,2012-09-19,False
114807323,1.0,1.0,1.0,2.0,,1.0,,,,Found Racist graffiti in the room. Request to ...,"{'username': 'TheUglyPhotographer', 'num_citie...",June 2011,240151,1,2011-06-27,False
84805430,4.0,5.0,4.0,3.0,5.0,4.0,4.0,,,Close to Union Square - hotel is a bit of a ma...,"{'username': 'Moonstonemoclips', 'num_cities':...",October 2010,80793,0,2010-10-25,False
132971117,3.0,2.0,2.0,1.0,4.0,1.0,1.0,,,I had a bad vibe about this place from the mom...,"{'username': 'JoanellenJ', 'num_cities': 10, '...",June 2012,111418,1,2012-06-28,False
124104157,,,1.0,,,,,,,"After we received our ""final"" bill and left th...","{'username': 'Lynnworks', 'num_cities': 3, 'nu...",January 2012,671150,3,2012-02-04,False


In [45]:
# Inspect one raw 'author' entry (second row) to see which nested fields it carries
train_processed['author'].iat[1]

{'username': 'TheUglyPhotographer',
 'num_cities': 3,
 'num_helpful_votes': 4,
 'num_reviews': 4,
 'num_type_reviews': 4,
 'id': 'BB116F87FE8F9AB356F63853BFD32FFE',
 'location': 'Oceanside, California'}

In [34]:
# Data-quality check: count the null entries of every column in both splits
for heading, frame in (
    ("Missing values in training set:", train_processed),
    ("\nMissing values in test set:", test_processed),
):
    print(heading)
    print(frame.isnull().sum())

Missing values in training set:
author                                        0
date_stayed                                 950
offering_id                                   0
num_helpful_votes                             0
date                                          0
review_id                                     0
via_mobile                                    0
service                                    2450
cleanliness                                2455
overall                                       0
value                                      2453
location                                   5017
sleep_quality                              9815
rooms                                      4000
check_in_front_desk                       21091
business_service_(e_g_internet_access)    22041
text                                          0
dtype: int64

Missing values in test set:
author                                       0
date_stayed                                244
offering_id     

In [35]:
# Columns whose fraction of missing values exceeds this threshold are dropped.
MISSING_THRESHOLD = 0.7

# Fraction of missing values per column in each split
train_missing_ratio = train_processed.isnull().mean()
test_missing_ratio = test_processed.isnull().mean()  # used for reporting only

# Decide which columns to drop from the TRAINING set only, then remove the
# same columns from the test set. Deciding separately per split could leave
# train and test with different columns and lets test-set statistics
# influence feature selection.
removed_train_cols = train_missing_ratio[train_missing_ratio > MISSING_THRESHOLD].index.tolist()
removed_test_cols = [col for col in removed_train_cols if col in test_processed.columns]

# drop() returns new frames, so the later column assignments operate on
# independent objects rather than on a slice of the previous frame.
train_processed = train_processed.drop(columns=removed_train_cols)
test_processed = test_processed.drop(columns=removed_test_cols)

# Show removed columns
print(f"Columns removed from training set due to >{MISSING_THRESHOLD:.0%} missing:")
print(removed_train_cols)

print("\nColumns removed from test set (same columns as training set):")
print(removed_test_cols)

# Surface columns that are sparse in the test set only; they are kept so the
# two splits retain the same schema.
test_only_sparse = [
    col
    for col in test_missing_ratio[test_missing_ratio > MISSING_THRESHOLD].index
    if col not in removed_train_cols
]
if test_only_sparse:
    print(f"\nNote: kept columns that are >{MISSING_THRESHOLD:.0%} missing in the test set only:")
    print(test_only_sparse)

# Final missing value check
print("\nRemaining missing values in training set:")
print(train_processed.isnull().sum())

print("\nRemaining missing values in test set:")
print(test_processed.isnull().sum())

Columns removed from training set due to >70% missing:
['check_in_front_desk', 'business_service_(e_g_internet_access)']

Columns removed from test set due to >70% missing:
['check_in_front_desk', 'business_service_(e_g_internet_access)']

Remaining missing values in training set:
author                  0
date_stayed           950
offering_id             0
num_helpful_votes       0
date                    0
review_id               0
via_mobile              0
service              2450
cleanliness          2455
overall                 0
value                2453
location             5017
sleep_quality        9815
rooms                4000
text                    0
dtype: int64

Remaining missing values in test set:
author                  0
date_stayed           244
offering_id             0
num_helpful_votes       0
date                    0
review_id               0
via_mobile              0
service               649
cleanliness           642
overall                 0
value           

In [46]:
import re

def clean_text_for_generation(text):
    """Normalise a raw review string for text generation.

    Steps, in order:
      1. Lowercase the text.
      2. Replace HTML-like tags (``<...>``) with a space.
      3. Remove every character that is not a lowercase ASCII letter, a
         digit, one of ``. , ! ? ' " ; : -`` or whitespace.
      4. Collapse each run of whitespace into a single space and trim the
         ends.

    Args:
        text: Raw review text. Any non-``str`` value (e.g. ``None`` or a
            float NaN) is treated as missing.

    Returns:
        str: The cleaned, lowercase, ASCII-only text, or ``""`` when the
        input is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Lowercase first: the whitelist below only allows a-z, so filtering
    # before lowercasing would delete every capital letter.
    text = text.lower()

    # Strip tags while '<' and '>' are still present (the whitelist removes
    # them). A space is substituted so words separated only by a tag such as
    # <br/> are not glued together.
    text = re.sub(r'<.*?>', ' ', text)

    # Keep English chars & punctuation. This also removes all non-ASCII,
    # non-whitespace characters, so no separate non-ASCII pass is needed.
    text = re.sub(r'[^a-z0-9.,!?\'\";:\-\s]', '', text)

    # \s also matches Unicode whitespace (e.g. a non-breaking space), so this
    # turns every whitespace run into a plain ASCII space.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

In [47]:
# Run the text cleaner over the review text of both splits, overwriting the column
for frame in (train_processed, test_processed):
    frame['text'] = frame['text'].apply(clean_text_for_generation)