In [4]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Read the tourist behaviour records from disk
df = pd.read_csv("tourist_behaviour.csv")

# Columns fed into the clustering model
selected_features = [
    'Spending_Budget',
    'Trip_Frequency',
    'Travel_Duration',
    'Preferred_Attractions',
    'Online_Bookings',
]

# Fail fast on the first required column that is absent from the dataset
missing_feature = next(
    (name for name in selected_features if name not in df.columns),
    None,
)
if missing_feature is not None:
    raise KeyError(f"Column '{missing_feature}' not found in dataset!")

# One-hot encode the categorical 'Preferred_Attractions' column (first level
# dropped); the other selected columns are passed through as they are
X = pd.get_dummies(
    df[selected_features],
    columns=['Preferred_Attractions'],
    drop_first=True,
)

# Standardize the feature matrix before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Keyword arguments shared by every K-Means fit below, so the elbow runs and
# the final model are configured identically
kmeans_settings = {'init': 'k-means++', 'random_state': 42, 'n_init': 10}

# Elbow method: record the inertia (WCSS) of a fitted model for k = 1..10
wcss = [
    KMeans(n_clusters=k, **kmeans_settings).fit(X_scaled).inertia_
    for k in range(1, 11)
]

# Fit the final model. The cluster count is fixed at 5 here; it is not
# derived programmatically from the wcss values collected above.
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, **kmeans_settings).fit(X_scaled)
df['Tourist_Segment'] = kmeans.labels_

# Write the original rows plus their segment label to a new CSV
df.to_csv("tourist_segments.csv", index=False)

# Preview the first rows of the labelled data
print(df.head())



   Spending_Budget  Trip_Frequency  Travel_Duration Preferred_Attractions  \
0             8747              10                4               Beaches   
1              634               2               12      Historical Sites   
2             8890               9                8             Mountains   
3             2497               1               25             Mountains   
4             2054               2               25               Museums   

   Online_Bookings  Tourist_Segment  
0               17                1  
1               11                3  
2                9                1  
3               12                4  
4               19                0  
