In [9]:
# Data collection and preprocessing

In [86]:
import pandas as pd
import re
import numpy as np
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time

def _element_text(parent, xpath):
    """Return the text of the first descendant of `parent` matching `xpath`, or None if absent."""
    try:
        return parent.find_element(By.XPATH, xpath).text
    except NoSuchElementException:
        return None


def scrape_page(driver):
    """Scrape every activity card on the page currently loaded in `driver`.

    Parameters
    ----------
    driver : selenium WebDriver
        Driver that is already showing a listing page.

    Returns
    -------
    list of tuple
        One ``(name, review, attraction_type, distance)`` tuple of strings per
        activity card. Cards for which any of the four fields cannot be found
        are skipped entirely, so every returned tuple is complete and the four
        values always belong to the same card.
    """
    # Find all activity elements on the page
    activities = driver.find_elements(By.XPATH, '//*[@id="ottd-smart-platform"]/section/div[2]/div[3]/div[2]/div/div/div[2]/div[2]/ul/li/a/div[5]')

    records = []

    # Build one record per card. Collecting each field into its own list and
    # zipping afterwards would pair values from different cards as soon as a
    # single field is missing somewhere.
    for activity in activities:
        name = _element_text(activity, './/div[1]')

        # The review value is taken from the second line of the div[2] text;
        # a block with fewer than two lines is treated as "no review".
        review = None
        review_block = _element_text(activity, './/div[2]')
        if review_block is not None:
            review_lines = review_block.split("\n")
            if len(review_lines) > 1:
                review = review_lines[1]

        # Assuming the attraction type is nested within a div with class 'tags-box'
        attraction_type = _element_text(activity, './/div[@class="tags-box"]')

        distance = _element_text(activity, './/div[5]')

        record = (name, review, attraction_type, distance)
        if any(field is None for field in record):
            # Incomplete card: drop it as a whole rather than misalign columns
            continue
        records.append(record)

    return records

def classify_attraction_type(types):
    """Map newline-separated attraction tags to a comma-separated category label.

    Parameters
    ----------
    types : str
        Tag text with one tag per line. Any non-string value (e.g. None or
        NaN) is treated as "no tags".

    Returns
    -------
    str
        Every matching category joined with ', ', listed in the order the
        categories are defined below, or 'other tourist attraction' when no
        keyword matches.
    """
    categories = {
        'park': ['parks', 'gardens', 'reserves'],
        'market': ['markets'],
        'modern architecture': ['modern architectures'],
        'historical site': ['historical sites', 'historical', 'memorial', 'cemeteries', 'temples'],
        'cinema': ['cinema'],
        'museum': ['museum', 'galleries', 'art', 'exhibition'],
        'forest': ['forest'],
        'stadium': ['stadium'],
        'mountain': ['mountain'],
        'waterfall': ['waterfall'],
        'church': ['church', 'cathedral']
    }

    if not isinstance(types, str):
        return 'other tourist attraction'

    # Lower-case the tags once instead of once per keyword
    tags = [tag.lower() for tag in types.split('\n')]

    # Keywords are matched as substrings of each tag. Iterating the dict
    # (instead of collecting into a set) keeps the label order deterministic,
    # so identical tags always yield the identical label string.
    matched = [
        category
        for category, keywords in categories.items()
        if any(keyword in tag for keyword in keywords for tag in tags)
    ]
    return ', '.join(matched) if matched else 'other tourist attraction'

def extract_distance(distance):
    """Return the first number found in `distance` as a float, or None if there is none.

    NOTE(review): whatever follows the number (e.g. a unit) is discarded, so
    values given in different units would end up on the same scale — confirm
    the scraped text always uses a single unit.
    """
    first_number = re.search(r'\d+\.\d+|\d+', distance)
    if first_number is None:
        return None
    return float(first_number.group())

# Set up the Chrome driver
driver = webdriver.Chrome()

# Everything that talks to the browser runs inside try/finally so the browser
# is closed even when navigation or scraping raises.
try:
    # Navigate to the website
    driver.get('https://www.trip.com/things-to-do/list-433/city?citytype=dt&id=433&name=Gwangju&keyword=&pshowcode=Ticket2&locale=en-US&curr=USD')

    # Wait for the page to load
    time.sleep(5)

    # Scrape data from the first page
    data = scrape_page(driver)

    # Loop through pages 2 to 16
    for page_num in range(2, 17):
        # Find the element corresponding to the link to the current page
        try:
            page_link = driver.find_element(By.XPATH, f'//a[text()="{page_num}"]')
        except NoSuchElementException:
            # No link for this page: keep what has been scraped so far
            # instead of aborting the whole run.
            print(f"No link found for page {page_num}; stopping pagination.")
            break

        # Click on the link to navigate to the current page
        page_link.click()

        # Wait for the page to load
        time.sleep(5)

        # Scrape data from the current page and extend it to the existing data
        data.extend(scrape_page(driver))
finally:
    # Close the browser
    driver.quit()

# Convert the combined data to a DataFrame
df = pd.DataFrame(data, columns=['Name', 'Review', 'Attraction Type', 'Distance'])

# Apply distance extraction and categorization
df['Distance'] = df['Distance'].apply(extract_distance)
df['Categorized Type'] = df['Attraction Type'].apply(classify_attraction_type)

# Print the DataFrame
print(df)


                                        Name Review  \
0              1913 Songjeong Station Market    3.4   
1           The Gwangju Ecological Lake Park    3.2   
2             Kim Dae Jung Convention Center    3.2   
3                    U-Square Culture Centre    3.2   
4                          Uncheon Reservoir    3.0   
5                            Penguin Village    3.0   
6                                   CGV 광주첨단    2.9   
7                              Uchi Park Zoo    2.9   
8                         CGV Gwangju Sangmu    2.9   
9                         LOTTE Cinema Suwan    2.8   
10                          Megabox Cheomdan    2.8   
11  May 18 Democratization Movement Archives    2.8   
12                       Gwangju Family Land    2.7   
13                              Lotte Cinema    2.7   
14                Jeungsimsa Buddhist Temple    2.7   
15                           Suwan Lake Park    2.6   
16                     May 18th Liberty Park    2.6   
17        

In [87]:
# The raw tag text is no longer needed once 'Categorized Type' exists
df = df.drop(columns=['Attraction Type'])
print(df)

                                        Name Review  Distance  \
0              1913 Songjeong Station Market    3.4       6.0   
1           The Gwangju Ecological Lake Park    3.2      13.8   
2             Kim Dae Jung Convention Center    3.2       1.8   
3                    U-Square Culture Centre    3.2       7.2   
4                          Uncheon Reservoir    3.0       4.3   
5                            Penguin Village    3.0       6.3   
6                                   CGV 광주첨단    2.9       1.3   
7                              Uchi Park Zoo    2.9       4.0   
8                         CGV Gwangju Sangmu    2.9       5.7   
9                         LOTTE Cinema Suwan    2.8       3.6   
10                          Megabox Cheomdan    2.8       3.4   
11  May 18 Democratization Movement Archives    2.8     600.0   
12                       Gwangju Family Land    2.7      11.2   
13                              Lotte Cinema    2.7       4.4   
14                Jeungsi

In [88]:
# How many rows fall into each categorized type
count = df['Categorized Type'].value_counts()

# Give every row the size of the category it belongs to
df = df.assign(**{'Categorized Type Count': df['Categorized Type'].map(count)})

print(df)

                                        Name Review  Distance  \
0              1913 Songjeong Station Market    3.4       6.0   
1           The Gwangju Ecological Lake Park    3.2      13.8   
2             Kim Dae Jung Convention Center    3.2       1.8   
3                    U-Square Culture Centre    3.2       7.2   
4                          Uncheon Reservoir    3.0       4.3   
5                            Penguin Village    3.0       6.3   
6                                   CGV 광주첨단    2.9       1.3   
7                              Uchi Park Zoo    2.9       4.0   
8                         CGV Gwangju Sangmu    2.9       5.7   
9                         LOTTE Cinema Suwan    2.8       3.6   
10                          Megabox Cheomdan    2.8       3.4   
11  May 18 Democratization Movement Archives    2.8     600.0   
12                       Gwangju Family Land    2.7      11.2   
13                              Lotte Cinema    2.7       4.4   
14                Jeungsi

In [90]:
# Drop the textual category column. 'Categorized Type Frequency' is not
# created by any cell shown in this notebook, so dropping it unconditionally
# raises KeyError on a fresh run; errors='ignore' removes whichever of these
# columns exist and also makes the cell safe to re-run.
df = df.drop(columns=['Categorized Type', 'Categorized Type Frequency'], errors='ignore')
print(df)

                                        Name Review  Distance  \
0              1913 Songjeong Station Market    3.4       6.0   
1           The Gwangju Ecological Lake Park    3.2      13.8   
2             Kim Dae Jung Convention Center    3.2       1.8   
3                    U-Square Culture Centre    3.2       7.2   
4                          Uncheon Reservoir    3.0       4.3   
5                            Penguin Village    3.0       6.3   
6                                   CGV 광주첨단    2.9       1.3   
7                              Uchi Park Zoo    2.9       4.0   
8                         CGV Gwangju Sangmu    2.9       5.7   
9                         LOTTE Cinema Suwan    2.8       3.6   
10                          Megabox Cheomdan    2.8       3.4   
11  May 18 Democratization Movement Archives    2.8     600.0   
12                       Gwangju Family Land    2.7      11.2   
13                              Lotte Cinema    2.7       4.4   
14                Jeungsi

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [96]:
# Features: the distance value and the size of the attraction's category
feature_columns = ['Distance', 'Categorized Type Count']
X = df[feature_columns]

# Target: the review score (assumed to be a rating from 1 to 5)
y = df['Review']

# Place names, kept alongside X and y so rows can be identified later
names = df['Name']  # 'Name' must be present in df under exactly this label

# Map distance to category
def map_distance_to_category(distance):
    """Bucket a distance into category 1, 2 or 3.

    Returns 1 for 0..50, 2 for values above 50 up to 300, 3 for values above
    300 up to 814, and None for anything outside 0..814 (including NaN).
    """
    # Written as a negated range check so that NaN, for which every
    # comparison is False, also ends up here.
    if not (0 <= distance <= 814):
        return None
    if distance <= 50:
        return 1
    if distance <= 300:
        return 2
    return 3

# Bucket every distance into its category code
categorized_distance = df['Distance'].apply(map_distance_to_category)
# Swap the 'Distance' column for the category codes.
# NOTE(review): X was selected from df before this point, so X still holds
# the uncategorized distances — confirm that is what the model should see.
df = df.assign(Distance=categorized_distance)

print(df)

                                        Name Review  Distance  \
0              1913 Songjeong Station Market    3.4         1   
1           The Gwangju Ecological Lake Park    3.2         1   
2             Kim Dae Jung Convention Center    3.2         1   
3                    U-Square Culture Centre    3.2         1   
4                          Uncheon Reservoir    3.0         1   
5                            Penguin Village    3.0         1   
6                                   CGV 광주첨단    2.9         1   
7                              Uchi Park Zoo    2.9         1   
8                         CGV Gwangju Sangmu    2.9         1   
9                         LOTTE Cinema Suwan    2.8         1   
10                          Megabox Cheomdan    2.8         1   
11  May 18 Democratization Movement Archives    2.8         3   
12                       Gwangju Family Land    2.7         1   
13                              Lotte Cinema    2.7         1   
14                Jeungsi

In [97]:
# StandardScaler is used below but is not imported by any other cell, so it is
# imported here to keep the notebook runnable on a fresh kernel.
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
# Split data while keeping names aligned with X and y
X_train, X_test, y_train, y_test, names_train, names_test = train_test_split(
    X, y, names, test_size=0.2, random_state=42
)

# Scaling features (important for SVM): fit on the training split only, then
# apply the same transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create and train the SVR model with a linear kernel
svr = SVR(kernel='linear')
svr.fit(X_train_scaled, y_train)

# Make predictions
y_pred_svr = svr.predict(X_test_scaled)

# Calculate Mean Squared Error
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f"Mean Squared Error (SVR): {mse_svr}")



Mean Squared Error (SVR): 0.3738228343589142


In [98]:
# Pair each test-set place name with the review value the SVR predicted for it
predictions_df_svr = names_test.to_frame(name='Name').assign(
    **{'SVR Predicted Review': y_pred_svr}
)

print(predictions_df_svr.head())

                             Name  SVR Predicted Review
0   1913 Songjeong Station Market              2.359948
5                 Penguin Village              2.600572
34       Naejangsan National Park              3.001974
13                   Lotte Cinema              2.600715
44             Songgwangsa Temple              2.998939


In [100]:
import pickle
# Persist the trained SVR model (`svr`) to disk with pickle
filename = 'train_model.sav'
with open(filename, mode='wb') as model_file:
    pickle.dump(svr, model_file)