# Travel Guide Places - Machine Learning Analysis

This notebook demonstrates how to use the CSV data exported from your Travel Guide application for machine learning analysis.

## Setup

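First, make sure you have downloaded the CSV file from your Travel Guide admin panel and placed it in the same directory as this notebook. The load cell below expects it to be named `places_ml_ready.csv` (or `places.csv`); update the path there if your file is stored elsewhere.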

In [None]:
# Install required packages
%pip install pandas numpy scikit-learn matplotlib seaborn plotly folium

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, mean_squared_error
import warnings
# Suppress every warning for the rest of the session to keep cell output tidy.
# NOTE(review): this is a blanket filter, so deprecation notices from the
# libraries imported above are hidden as well.
warnings.filterwarnings('ignore')

# Set plotting style
# The matplotlib style sheet is applied before the seaborn palette call;
# presumably the order matters so the palette is not reset by the style
# sheet — keep the two lines in this order.
# NOTE(review): the 'seaborn-v0_8' style name is only available in newer
# matplotlib releases — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## Load and Explore Data

Load the CSV file downloaded from your Travel Guide application.

In [None]:
# Read the exported places CSV into a DataFrame.
# Point the path at your own download if it differs (e.g. 'places.csv').
df = pd.read_csv(filepath_or_buffer='places_ml_ready.csv')

# Quick sanity check: dimensions, column names, then the first rows
print("Dataset shape:", df.shape)
print("\nColumns:", list(df.columns))
df.head(n=5)

In [None]:
# Data overview: schema, per-column null counts, and summary statistics
print("Data Info:")
df.info()

# Count missing entries column by column
missing_per_column = df.isna().sum()
print("\nMissing values:")
print(missing_per_column)

# The describe() table is the cell's displayed output
print("\nDescriptive statistics:")
df.describe()

## Data Visualization and Analysis

In [None]:
# Category analysis: a 2x2 dashboard of category, region and visit-time views
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
(ax_category, ax_region), (ax_hist, ax_avg) = axes

# Top-left: number of places per category
category_counts = df['category'].value_counts()
category_counts.plot.bar(ax=ax_category)
ax_category.set_title('Places by Category')
ax_category.tick_params(axis='x', rotation=45)

# Top-right: share of places per region
region_counts = df['region'].value_counts()
region_counts.plot.pie(ax=ax_region, autopct='%1.1f%%')
ax_region.set_title('Places by Region')

# Bottom-left: histogram of the estimated visit time
df['estimated_time_to_visit'].hist(ax=ax_hist, bins=20)
ax_hist.set_title('Distribution of Visit Times')
ax_hist.set_xlabel('Hours')

# Bottom-right: mean visit time per category, sorted ascending
avg_time = (
    df.groupby('category')['estimated_time_to_visit']
    .mean()
    .sort_values()
)
avg_time.plot.barh(ax=ax_avg)
ax_avg.set_title('Average Visit Time by Category')

plt.tight_layout()
plt.show()

In [None]:
# Geographic visualization (if coordinates are available)
# A coordinate pair counts as valid only when both values are present AND
# non-zero. The zero check mirrors the original filter (the export appears to
# use 0 as a "no coordinate" placeholder — TODO confirm). The explicit notna()
# is required because NaN != 0 evaluates to True, so rows with missing
# coordinates would otherwise be kept and reported as valid.
if 'latitude' in df.columns and 'longitude' in df.columns:
    has_latitude = df['latitude'].notna() & (df['latitude'] != 0)
    has_longitude = df['longitude'].notna() & (df['longitude'] != 0)
    valid_coords = df[has_latitude & has_longitude]

    if len(valid_coords) > 0:
        plt.figure(figsize=(12, 8))
        # One marker per place, coloured by its estimated visit time
        scatter = plt.scatter(valid_coords['longitude'], valid_coords['latitude'],
                              c=valid_coords['estimated_time_to_visit'],
                              cmap='viridis', alpha=0.7, s=100)
        plt.colorbar(scatter, label='Visit Time (hours)')
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')
        plt.title('Places by Geographic Location (colored by visit time)')
        plt.grid(True, alpha=0.3)
        plt.show()

        print(f"Found {len(valid_coords)} places with valid coordinates")
    else:
        print("No valid coordinates found in the dataset")
else:
    print("No coordinate columns found")

## Machine Learning: Predicting Visit Time

In [None]:
# Prepare features for ML
# Derive the text-length features from the raw columns when the export does
# not already contain them. Each feature is checked on its own so that a
# partially prepared CSV (e.g. one that has name_length but lacks
# description_word_count) cannot cause a KeyError when the feature matrix is
# built in the next cell.
if 'name_length' not in df.columns:
    df['name_length'] = df['name'].str.len()
if 'description_length' not in df.columns:
    df['description_length'] = df['description'].str.len()
if 'description_word_count' not in df.columns:
    df['description_word_count'] = df['description'].str.split().str.len()

# Encode categorical variables
# Missing categories are mapped to the literal label 'unknown' before encoding.
le = LabelEncoder()
if 'category_encoded' not in df.columns:
    df['category_encoded'] = le.fit_transform(df['category'].fillna('unknown'))

# Select features for the model: the three text features are always used,
# the encoded / popularity columns only when the dataset provides them.
feature_columns = ['name_length', 'description_length', 'description_word_count']
optional_features = ['category_encoded', 'region_encoded', 'popularity_score']
feature_columns.extend(col for col in optional_features if col in df.columns)

# Add coordinates only when BOTH columns exist and at least one row carries a
# usable pair (present and non-zero). Testing `latitude.sum() != 0` is not
# reliable: positive and negative latitudes can cancel to zero, a single
# non-zero value passes the test, and longitude was never checked at all.
if 'latitude' in df.columns and 'longitude' in df.columns:
    usable_coords = (
        df['latitude'].notna() & (df['latitude'] != 0)
        & df['longitude'].notna() & (df['longitude'] != 0)
    )
    if usable_coords.any():
        feature_columns.extend(['latitude', 'longitude'])

print("Features selected for ML model:", feature_columns)

In [None]:
# Prepare data for training
# X and y stay full-length (one row per place) so later cells can still score
# the whole dataset: missing feature values become 0 and, in y only, missing
# targets are replaced by the median visit time.
X = df[feature_columns].fillna(0)
target = df['estimated_time_to_visit']
y = target.fillna(target.median())

# Only rows with an observed visit time take part in training and evaluation.
# Fitting or scoring against median-imputed targets would feed a statistic
# computed on the full dataset into the test set and distort the reported
# metrics. When no target is missing this yields the same split as before.
has_target = target.notna()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X[has_target], y[has_target], test_size=0.2, random_state=42
)

# Scale the features (the scaler is fitted on the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

In [None]:
# Fit a 100-tree random forest regressor on the scaled training features
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train_scaled, y_train)

# Predict visit times for the held-out rows
y_pred = rf_model.predict(X_test_scaled)

# Error metrics: MSE first, then its square root so the error is expressed
# in the same unit as the target
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Report RMSE next to the mean target for scale, plus the estimator's R²
print("Model Performance:")
print(f"RMSE: {rmse:.2f} hours")
print(f"Mean actual visit time: {y_test.mean():.2f} hours")
print(f"Model R² score: {rf_model.score(X_test_scaled, y_test):.3f}")

In [None]:
# Feature importance analysis
# Pair each feature name with the fitted forest's importance value and sort
# so the most influential feature comes first.
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

# One figure with two side-by-side panels; the plt.subplot() calls below
# select which panel the following pyplot calls draw into, so statement
# order matters here.
plt.figure(figsize=(12, 6))

# Plot feature importance
# Left panel: horizontal bars, one per feature.
plt.subplot(1, 2, 1)
sns.barplot(data=feature_importance, y='feature', x='importance')
plt.title('Feature Importance for Visit Time Prediction')
plt.xlabel('Importance')

# Plot actual vs predicted
# Right panel: test-set targets against predictions. The red dashed line is
# the diagonal spanning the observed target range; points on it are exact
# predictions.
plt.subplot(1, 2, 2)
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Visit Time (hours)')
plt.ylabel('Predicted Visit Time (hours)')
plt.title('Actual vs Predicted Visit Time')
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# head() returns at most five rows, matching the "Top 5" heading; fewer are
# printed when the model uses fewer than five features.
print("\nTop 5 Most Important Features:")
print(feature_importance.head())

## Recommendation System

In [None]:
def recommend_places(df, user_preferences=None, top_n=10):
    """
    Recommend places based on user preferences.

    Every place receives a weighted score built from four components:
    closeness to the preferred visit time (weight 0.3), category match (0.4),
    description length (0.1) and popularity (0.2). The ``top_n`` highest
    scoring places are returned. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Places table. Needs ``estimated_time_to_visit`` and ``category``.
        ``description_length`` is used when present and otherwise derived
        from ``description``; ``popularity_score`` is optional.
    user_preferences : dict, optional
        May contain ``preferred_time`` (hours), ``preferred_categories``
        (collection of category names) and ``avoid_long_descriptions``
        (bool). ``None`` selects the built-in defaults. In a user-supplied
        dict, a missing ``preferred_time`` or ``preferred_categories`` falls
        back to its default, and a missing ``avoid_long_descriptions`` means
        ``False``.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows sorted by descending ``recommendation_score``,
        with the columns ``name``, ``category``, ``region``,
        ``estimated_time_to_visit`` (those that exist in ``df``) and
        ``recommendation_score``.
    """
    default_preferences = {
        'preferred_time': 3.0,  # hours
        'preferred_categories': ['historical', 'religious'],
        'avoid_long_descriptions': True
    }
    if user_preferences is None:
        user_preferences = default_preferences

    # A partial preference dict must not raise KeyError: the two required
    # settings fall back to their defaults. The description flag keeps its
    # previous meaning (absent -> False) for caller-supplied dicts.
    preferred_time = user_preferences.get(
        'preferred_time', default_preferences['preferred_time'])
    preferred_categories = user_preferences.get(
        'preferred_categories', default_preferences['preferred_categories'])
    avoid_long_descriptions = user_preferences.get('avoid_long_descriptions', False)

    recommendations = df.copy()

    # Time preference score: 1.0 for an exact match, decaying with the gap
    time_diff = abs(recommendations['estimated_time_to_visit'] - preferred_time)
    recommendations['time_score'] = 1 / (1 + time_diff)

    # Category preference score: full credit for preferred categories,
    # reduced credit for everything else
    recommendations['category_score'] = recommendations['category'].apply(
        lambda x: 1.0 if x in preferred_categories else 0.3
    )

    # Description length preference (some users prefer concise descriptions).
    # Use the precomputed column when available, otherwise derive it here so
    # the function does not depend on another cell having mutated the frame.
    if 'description_length' in recommendations.columns:
        description_length = recommendations['description_length']
    elif 'description' in recommendations.columns:
        description_length = recommendations['description'].str.len()
    else:
        description_length = None

    if avoid_long_descriptions and description_length is not None:
        # Descriptions above the 70th percentile are considered long
        max_desc_length = description_length.quantile(0.7)
        recommendations['desc_score'] = description_length.apply(
            lambda x: 1.0 if x <= max_desc_length else 0.5
        )
    else:
        recommendations['desc_score'] = 1.0

    # Popularity score (if available), normalised by the maximum. When the
    # column is missing, or its maximum is zero/NaN (normalising would divide
    # by zero and turn every final score into NaN), use the neutral 0.5.
    pop_score = 0.5
    if 'popularity_score' in recommendations.columns:
        max_popularity = recommendations['popularity_score'].max()
        if max_popularity > 0:
            pop_score = recommendations['popularity_score'] / max_popularity

    # Calculate overall recommendation score
    recommendations['recommendation_score'] = (
        recommendations['time_score'] * 0.3 +
        recommendations['category_score'] * 0.4 +
        recommendations['desc_score'] * 0.1 +
        pop_score * 0.2
    )

    # Sort by score and return top N
    top_recommendations = recommendations.nlargest(top_n, 'recommendation_score')

    # Only select descriptive columns that exist, so a dataset without e.g.
    # 'region' still yields recommendations instead of a KeyError.
    wanted_columns = ['name', 'category', 'region', 'estimated_time_to_visit',
                      'recommendation_score']
    output_columns = [col for col in wanted_columns if col in top_recommendations.columns]
    return top_recommendations[output_columns]

# Example preference profile: visits of about 2.5 hours, three favoured
# categories, and a penalty on places with long descriptions
user_prefs = {
    'preferred_time': 2.5,
    'preferred_categories': ['historical', 'religious', 'natural'],
    'avoid_long_descriptions': True,
}

# Score every place against the profile and keep the ten best matches
recommendations = recommend_places(df, user_preferences=user_prefs, top_n=10)

print("Top 10 Recommended Places:")
print(recommendations.round(decimals=3))

## Advanced Analysis: Clustering Places

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Prepare data for clustering
# Visit time and the text-length features are always used; the encoded
# category and popularity are added when the dataset provides them.
cluster_features = ['estimated_time_to_visit', 'name_length', 'description_length']
if 'category_encoded' in df.columns:
    cluster_features.append('category_encoded')
if 'popularity_score' in df.columns:
    cluster_features.append('popularity_score')

# Missing values become 0, then every feature is standardised so that no
# single column dominates the distance computation.
X_cluster = df[cluster_features].fillna(0)
X_cluster_scaled = StandardScaler().fit_transform(X_cluster)

# Apply K-means clustering
# KMeans raises an error when asked for more clusters than there are rows, so
# the target of 4 clusters is capped for very small datasets. n_init is set
# explicitly because its default differs between scikit-learn versions, which
# would otherwise make cluster assignments depend on the installed release.
n_clusters = min(4, len(X_cluster))
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(X_cluster_scaled)

# Visualize clusters using PCA (projection onto the first two components)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster_scaled)

plt.figure(figsize=(12, 5))

# Plot clusters
plt.subplot(1, 2, 1)
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['cluster'], cmap='viridis', alpha=0.7)
plt.colorbar(scatter)
plt.xlabel(f'First Principal Component ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'Second Principal Component ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Place Clusters (PCA Visualization)')

# Cluster characteristics
plt.subplot(1, 2, 2)
cluster_means = df.groupby('cluster')['estimated_time_to_visit'].mean()
cluster_means.plot(kind='bar')
plt.title('Average Visit Time by Cluster')
plt.xlabel('Cluster')
plt.ylabel('Average Visit Time (hours)')
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

# Analyze clusters
print("Cluster Analysis:")
for cluster_id in sorted(df['cluster'].unique()):
    cluster_data = df[df['cluster'] == cluster_id]

    # mode() is empty when every category in the cluster is missing
    category_modes = cluster_data['category'].mode()
    top_category = category_modes.iloc[0] if len(category_modes) > 0 else 'N/A'

    # Skip missing names and coerce to str so ', '.join() cannot fail on NaN
    example_names = cluster_data['name'].dropna().astype(str).head(3).tolist()

    print(f"\nCluster {cluster_id}:")
    print(f"  Size: {len(cluster_data)} places")
    print(f"  Avg visit time: {cluster_data['estimated_time_to_visit'].mean():.2f} hours")
    print(f"  Most common category: {top_category}")
    print(f"  Example places: {', '.join(example_names)}")

## Export Results for Further Use

In [None]:
# Build the enriched table: every existing column (derived features and
# cluster labels included) plus the model's prediction for each place.
# Predictions cover the full feature matrix, i.e. rows the model was trained
# on as well as held-out rows.
predicted_hours = rf_model.predict(scaler.transform(X))
output_df = df.copy()
output_df['predicted_visit_time'] = predicted_hours

# Write the enriched table to disk
output_df.to_csv('travel_places_with_ml_insights.csv', index=False)
print("Enhanced dataset saved as 'travel_places_with_ml_insights.csv'")

# Write the recommendation list to disk
recommendations.to_csv('place_recommendations.csv', index=False)
print("Recommendations saved as 'place_recommendations.csv'")

# Closing summary of what the notebook produced
print("\n=== Analysis Complete! ===")
print("You now have:")
for summary_line in (
    "1. A trained model for predicting visit times",
    "2. Place clusters for segmentation",
    "3. A recommendation system",
    "4. Enhanced dataset with ML insights",
):
    print(summary_line)