In [6]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
import joblib
import sys

# --- CONFIG ---
DATA_FILENAME = "ecotourism_dataset.csv"
PROCESSED_DATA_PATH = "processed_data.csv"
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

# --- SMART PATH DETECTION ---
# We check multiple locations to find where the file is hiding
possible_paths = [
    Path(DATA_FILENAME),              # Current directory
    Path("data") / DATA_FILENAME,     # data/ folder
    Path("../data") / DATA_FILENAME,  # Parent data folder
    Path("notebooks/data") / DATA_FILENAME # Notebooks data folder
]

DATA_FILE = None
for path in possible_paths:
    if path.exists():
        DATA_FILE = path
        print(f"‚úÖ Found data at: {path.resolve()}")
        break

if DATA_FILE is None:
    print("‚ùå Error: Could not find 'ecotourism_dataset.csv'.")
    print("   Please make sure you have downloaded the file and placed it in your project folder.")
    print(f"   Checked locations: {[p.name for p in possible_paths]}")
    sys.exit()

# --- LOAD DATA ---
print("üîÑ Loading Ecotourism Dataset...")
df = pd.read_csv(DATA_FILE)

# --- CLEANING ---
# We will predict 'Sentiment_Label' based on user features
target_col = "Sentiment_Label"
feature_cols = [
    "Visitor_Age", "Visit_Type", "Travel_Purpose", 
    "Eco_Rating", "Service_Quality", "Crowd_Level", 
    "Expense_Level", "Eco_Activity_Count"
]

# Drop rows with missing values in critical columns
df = df.dropna(subset=feature_cols + [target_col])

# --- ENCODING ---
encoders = {}
for col in ["Visit_Type", "Travel_Purpose", "Crowd_Level", "Expense_Level"]:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# Encode Target
target_le = LabelEncoder()
df[target_col] = target_le.fit_transform(df[target_col])
encoders["target"] = target_le

# Save Encoders for the App
joblib.dump(encoders, MODEL_DIR / "encoders.pkl")
print("‚úÖ Encoders saved.")

# Save Processed Data
final_df = df[feature_cols + [target_col]]
final_df.to_csv(PROCESSED_DATA_PATH, index=False)
print(f"‚úÖ Processed data saved to {PROCESSED_DATA_PATH}")
print(f"   Features: {feature_cols}")
print(f"   Target: {target_col}")

‚úÖ Found data at: D:\Projects\Projects Collection\Final Year Project\Charoo\AI Powered Guide for Sustainabe Tourism Prediction\data\ecotourism_dataset.csv
üîÑ Loading Ecotourism Dataset...
‚úÖ Encoders saved.
‚úÖ Processed data saved to processed_data.csv
   Features: ['Visitor_Age', 'Visit_Type', 'Travel_Purpose', 'Eco_Rating', 'Service_Quality', 'Crowd_Level', 'Expense_Level', 'Eco_Activity_Count']
   Target: Sentiment_Label


In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib
import sys

# --- CONFIG ---
DATA_FILENAME = "processed_data.csv"
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

# --- PATH DETECTION ---
if Path(DATA_FILENAME).exists():
    DATA_PATH = Path(DATA_FILENAME)
else:
    print("‚ùå Processed data not found. Run 01_process_data.py first.")
    sys.exit()

# --- LOAD ---
df = pd.read_csv(DATA_PATH)
X = df.drop(columns=["Sentiment_Label"])
y = df["Sentiment_Label"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- TRAINING WITH CROSS-VALIDATION ---
print("ü§ñ Training & Evaluating models...")

# 5-Fold Cross Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model 1: Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf_cv_scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='accuracy')
rf.fit(X_train, y_train)
rf_test_acc = accuracy_score(y_test, rf.predict(X_test))
print(f"   Random Forest CV Mean: {rf_cv_scores.mean():.2%} | Test Acc: {rf_test_acc:.2%}")

# Model 2: Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr_cv_scores = cross_val_score(lr, X_train, y_train, cv=cv, scoring='accuracy')
lr.fit(X_train, y_train)
lr_test_acc = accuracy_score(y_test, lr.predict(X_test))
print(f"   Logistic Regression CV Mean: {lr_cv_scores.mean():.2%} | Test Acc: {lr_test_acc:.2%}")

# --- SELECTION (Based on Test Accuracy) ---
if rf_test_acc >= lr_test_acc:
    best_model = rf
    best_name = "Random Forest"
    best_acc = rf_test_acc
else:
    best_model = lr
    best_name = "Logistic Regression"
    best_acc = lr_test_acc

print(f"üèÜ Best Model Selected: {best_name}")

# --- SAVE ARTIFACTS ---
joblib.dump(best_model, MODEL_DIR / "best_model.pkl")

# Save Metrics for App
metrics = {
    "accuracy": best_acc,
    "rf_acc": rf_test_acc,
    "lr_acc": lr_test_acc,
    "rf_cv": rf_cv_scores.mean(),
    "lr_cv": lr_cv_scores.mean(),
    "best_model": best_name,
    "confusion_matrix": confusion_matrix(y_test, best_model.predict(X_test)).tolist()
}
joblib.dump(metrics, MODEL_DIR / "model_metrics.pkl")
print("‚úÖ Models and Cross-Validation metrics saved.")

ü§ñ Training & Evaluating models...
   Random Forest CV Mean: 33.49% | Test Acc: 35.83%
   Logistic Regression CV Mean: 35.99% | Test Acc: 36.46%
üèÜ Best Model Selected: Logistic Regression
‚úÖ Models and Cross-Validation metrics saved.
