In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import re

# Source workbook and the destination path for the cleaned copy.
input_file = "destinasi-wisata-indonesia.xlsx"
output_file = "destinasi-wisata-indonesia-preprocessed.xlsx"
data = pd.read_excel(input_file)

# Discard the location columns; errors='ignore' tolerates workbooks that
# do not contain some (or any) of them.
irrelevant_columns = ['coordinate', 'lat', 'long']
data = data.drop(columns=irrelevant_columns, errors='ignore')

# Fill gaps: each numeric column gets its own median (0 is used as the fill
# value when the column is absent) and missing descriptions become ''.
numeric_columns = ['Price', 'Rating']
fill_values = {
    column: data[column].median() if column in data.columns else 0
    for column in numeric_columns
}
fill_values['Description'] = ''
data.fillna(fill_values, inplace=True)

# Min-max scale every numeric column that is present into a new
# Normalized_<column> column; the scaler is refit for each column.
scaler = MinMaxScaler()
for column in numeric_columns:
    if column not in data.columns:
        continue
    data[f'Normalized_{column}'] = scaler.fit_transform(data[[column]])

# Text preprocessing for "Description" column
def preprocess_text(text):
    """Return a lowercased, punctuation-free, whitespace-normalized string.

    Args:
        text: The raw cell value. Normally a ``str``; ``None`` and float NaN
            are treated as missing, and any other non-string value (for
            example a number read from a spreadsheet cell) is converted
            with ``str()`` first.

    Returns:
        The cleaned text, or ``''`` when the input is missing.
    """
    # Missing values: None, or a float NaN (NaN is the only float that is
    # not equal to itself).
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Drop everything that is neither a word character nor whitespace.
    # Note: \w includes the underscore, so underscores are kept.
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs to a single space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Clean the descriptions only when the column exists, mirroring the
# `in data.columns` guards used for Price and Rating; without the guard a
# workbook lacking 'Description' raises KeyError before anything is saved.
if 'Description' in data.columns:
    data['Processed_Description'] = data['Description'].apply(preprocess_text)
else:
    print("Column 'Description' not found; skipping text preprocessing")

# Save the processed data (the DataFrame index is not written to the sheet).
data.to_excel(output_file, index=False)
print(f"Preprocessed data saved to {output_file}")

Preprocessed data saved to destinasi-wisata-indonesia-preprocessed.xlsx
