In [1]:
# =============================== Classification Model with Ensemble ===============================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
import joblib

# --- Load dataset ---
# The path must be a raw string: in a normal literal "\U" opens a
# \UXXXXXXXX unicode escape (a SyntaxError for this path) and "\f" would be
# read as a form-feed character, silently corrupting the file name.
# NOTE(review): this is an absolute, machine-specific location; point
# DATA_PATH at your own copy of final_dataset.csv when running elsewhere.
DATA_PATH = r"C:\Users\C Sutharsan\Downloads\GUVI class notes AIML\Capstone_project\Project_4\Final\final_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Advanced feature engineering for visit mode prediction
def create_advanced_features(df):
    """Derive user, temporal, geographic and sequence features for visit-mode modelling.

    Parameters
    ----------
    df : pandas.DataFrame
        Visit-level records. Must contain the columns ``UserId``,
        ``VisitSeason``, ``VisitMode_y``, ``VisitMonth``, ``VisitYear``,
        ``Continent``, ``AttractionType`` and ``Rating``. The input frame is
        not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` ordered by ``UserId``, ``VisitYear``, ``VisitMonth``
        with these columns added:

        * ``VisitSeason_preferred`` - the season each user visited most often.
        * ``user_pct_<mode>`` - share of the user's visits made in each mode.
        * ``month_mode_prob_<mode>`` - share of each mode within the visit month.
        * ``sin_month`` / ``cos_month`` - cyclical encoding of ``VisitMonth``.
        * ``continent_mode_prob_<mode>`` - share of each mode within the continent.
        * ``user_attraction_compatibility`` - the user's mean ``Rating`` for the
          row's ``AttractionType``.
        * ``prev_visit_mode`` - the same user's mode on the preceding row
          (missing for a user's first row).
        * ``user_travel_diversity`` / ``user_attraction_diversity`` - number of
          distinct modes / attraction types per user.

        Missing values in numeric columns are replaced by the column mean.

    Raises
    ------
    KeyError
        If any required column is absent from ``df``.

    Notes
    -----
    NOTE(review): ``user_pct_*``, ``month_mode_prob_*``,
    ``continent_mode_prob_*`` and ``user_travel_diversity`` are computed from
    ``VisitMode_y`` over *all* rows of ``df``, including each row's own value.
    If ``VisitMode_y`` is also the prediction target, these columns leak the
    label and any score measured on rows that were present here will be
    optimistic. Computing them on the training split only (and excluding the
    current row) would be needed for an honest evaluation.
    """
    required = (
        'UserId', 'VisitSeason', 'VisitMode_y', 'VisitMonth',
        'VisitYear', 'Continent', 'AttractionType', 'Rating',
    )
    absent = [name for name in required if name not in df.columns]
    if absent:
        raise KeyError(f"create_advanced_features: missing required columns {absent}")

    # Work on a copy so the caller's frame is left untouched
    df_enhanced = df.copy()

    # 1. USER BEHAVIOR PATTERNS
    # Most frequent season per user: count (user, season) pairs, order by
    # count and keep the first season seen for each user.
    season_counts = df.groupby(['UserId', 'VisitSeason']).size().reset_index(name='count')
    user_dominant_season = (
        season_counts.sort_values('count', ascending=False).groupby('UserId').first()
    )
    # Both sides carry a 'VisitSeason' column; the suffixes keep the row's own
    # value unchanged and name the user-level one 'VisitSeason_preferred'.
    df_enhanced = df_enhanced.merge(
        user_dominant_season[['VisitSeason']],
        left_on='UserId',
        right_index=True,
        how='left',
        suffixes=('', '_preferred'),
    )

    # Per-user share of each visit mode (one row per user, one column per mode)
    mode_counts = df.groupby(['UserId', 'VisitMode_y']).size().unstack(fill_value=0)
    user_visit_mode_pct = mode_counts.div(mode_counts.sum(axis=1), axis=0)
    # Prefix BEFORE merging and merge once: a single join instead of one per
    # mode, and an existing column that happens to share a mode's name
    # (e.g. 'Family') can no longer collide and defeat the rename.
    df_enhanced = df_enhanced.merge(
        user_visit_mode_pct.add_prefix('user_pct_'),
        left_on='UserId',
        right_index=True,
        how='left',
    )

    # 2. TEMPORAL PATTERNS
    # Share of each visit mode within every calendar month
    monthly_mode_dist = pd.crosstab(df['VisitMonth'], df['VisitMode_y'], normalize='index')
    for mode in monthly_mode_dist.columns:
        df_enhanced[f'month_mode_prob_{mode}'] = df_enhanced['VisitMonth'].map(
            monthly_mode_dist[mode].to_dict()
        )

    # Cyclical encoding for months (period of 12)
    month_angle = 2 * np.pi * df_enhanced['VisitMonth'] / 12
    df_enhanced['sin_month'] = np.sin(month_angle)
    df_enhanced['cos_month'] = np.cos(month_angle)

    # 3. GEOGRAPHIC PATTERNS
    # Share of each visit mode within every continent
    continent_mode_dist = pd.crosstab(df['Continent'], df['VisitMode_y'], normalize='index')
    for mode in continent_mode_dist.columns:
        df_enhanced[f'continent_mode_prob_{mode}'] = df_enhanced['Continent'].map(
            continent_mode_dist[mode].to_dict()
        )

    # 4. INTERACTION FEATURES
    # Mean rating the user gave to each attraction type
    user_attraction_compatibility = (
        df.groupby(['UserId', 'AttractionType'])['Rating']
        .mean()
        .reset_index()
        .rename(columns={'Rating': 'user_attraction_compatibility'})
    )
    df_enhanced = df_enhanced.merge(
        user_attraction_compatibility,
        on=['UserId', 'AttractionType'],
        how='left',
    )

    # 5. BEHAVIORAL SEQUENCES
    # Order each user's rows by year and month, then look one row back
    df_enhanced = df_enhanced.sort_values(['UserId', 'VisitYear', 'VisitMonth'])
    df_enhanced['prev_visit_mode'] = df_enhanced.groupby('UserId')['VisitMode_y'].shift(1)

    # 6. ADVANCED AGGREGATIONS
    # Number of distinct modes / attraction types each user has
    per_user = df_enhanced.groupby('UserId')
    df_enhanced['user_travel_diversity'] = per_user['VisitMode_y'].transform('nunique')
    df_enhanced['user_attraction_diversity'] = per_user['AttractionType'].transform('nunique')

    # Fill missing values in every numeric column with that column's mean
    numeric_columns = df_enhanced.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        df_enhanced[col] = df_enhanced[col].fillna(df_enhanced[col].mean())

    return df_enhanced

# Build the engineered feature table from the raw visit records
df_enhanced = create_advanced_features(df)

# Row-normalised share of every visit mode within each month
monthly_mode_dist = pd.crosstab(
    index=df_enhanced['VisitMonth'],
    columns=df_enhanced['VisitMode_y'],
    normalize='index',
)

# Nested lookup {mode: {month: prob, ...}, ...}; DataFrame.to_dict() with its
# default orientation already yields column -> {index -> value}
month_mode_probs = monthly_mode_dist.to_dict()

# Row-normalised share of every visit mode within each continent
continent_mode_dist = pd.crosstab(
    index=df_enhanced['Continent'],
    columns=df_enhanced['VisitMode_y'],
    normalize='index',
)

# Nested lookup {mode: {continent: prob, ...}, ...}
continent_mode_probs = continent_mode_dist.to_dict()

# Model inputs. The order of these lists fixes the column order of the design
# matrix, so keep it stable.
# NOTE(review): the user_pct_*, *_mode_prob_* and user_travel_diversity columns
# are built from VisitMode_y, which is also the target below - confirm this
# leakage is intended before trusting evaluation scores.
numerical_features = [
    'VisitMonth',
    'VisitQuarter',
    'VisitYear',
    'continent_mode_prob_Business',
    'continent_mode_prob_Couples',
    'user_pct_Couples',
    'user_pct_Family',
    'user_pct_Friends',
    'user_pct_Business',
    'user_travel_diversity',
    'attraction_avg_rating_before',
    'user_previous_visits',
    'city_popularity',
    'user_avg_rating_before',
    'user_attraction_compatibility',
    'sin_month',
    'cos_month',
    'month_mode_prob_Business',
    'month_mode_prob_Family',
    'month_mode_prob_Friends',
    'month_mode_prob_Couples',
]
categorical_features = [
    'VisitSeason',
    'Continent',
    'Region',
    'Country',
    'CityName',
    'AttractionType',
    'prev_visit_mode',
]

# Column the classifier is trained to predict
target = 'VisitMode_y'

In [2]:
# Drop rows with missing target (they cannot be used for supervised training)
df_enhanced = df_enhanced.dropna(subset=[target])

# Encode the target labels as integers; le_target keeps the label <-> code map
le_target = LabelEncoder()
y = le_target.fit_transform(df_enhanced[target])

# Select features (copy so the edits below never write back into df_enhanced)
X = df_enhanced[numerical_features + categorical_features].copy()

# Encode categorical features, keeping one fitted encoder per column
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    # Fill BEFORE casting: astype(str) turns NaN into the literal string "nan",
    # after which fillna("missing") has nothing left to replace.
    # Plain column assignment (rather than .loc[:, col] = ...) replaces the
    # column outright, so it takes the encoder's integer dtype instead of
    # possibly staying object-typed on newer pandas versions.
    X[col] = le.fit_transform(X[col].fillna("missing").astype(str))
    label_encoders[col] = le

# Recover the two feature groups in the order they appear in X
numeric_feats = X.columns[X.columns.isin(numerical_features)].tolist()
categorical_feats = X.columns[X.columns.isin(categorical_features)].tolist()

# Numeric columns: median-impute, then standardise
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Only the numeric columns are transformed; every remaining column
# (the label-encoded categoricals) is passed through unchanged
preprocessor = ColumnTransformer(
    [("num", numeric_transformer, numeric_feats)],
    remainder="passthrough",
)

# Stratified 80/20 hold-out split with a fixed seed
# (train_test_split is already imported at the top of the notebook)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Estimators to train, keyed by the name used in logs and file names
# (use_label_encoder is intentionally not passed, to avoid a warning)
models = {
    "XGBoost": XGBClassifier(
        n_estimators=200,
        random_state=42,
        eval_metric="mlogloss",
    ),
}

# Macro-averaged metrics reported for every model, in print order
macro_metrics = (
    ("Precision (macro):", precision_score),
    ("Recall (macro):", recall_score),
    ("F1 Score (macro):", f1_score),
)

# Fit each model behind the shared preprocessor, report hold-out metrics,
# keep its test-set predictions and persist the fitted pipeline
predictions = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)

    preds = pipe.predict(X_test)
    predictions[name] = preds

    print(f"{name} Performance:")
    print("Accuracy:", accuracy_score(y_test, preds))
    for metric_label, metric_fn in macro_metrics:
        print(metric_label, metric_fn(y_test, preds, average="macro"))
    print("\nClassification Report:\n", classification_report(y_test, preds))
    print("Confusion Matrix:\n", confusion_matrix(y_test, preds))

    joblib.dump(pipe, f"{name}_advanced_classifier.pkl")

# Ensemble voting (majority): one column of predictions per model, then the
# most common label in each row
from scipy.stats import mode
pred_array = np.column_stack([predictions[model_name] for model_name in predictions])
ensemble_preds = np.asarray(mode(pred_array, axis=1)[0]).flatten()

print("\nEnsemble Voting Performance:")
print("Accuracy:", accuracy_score(y_test, ensemble_preds))
for metric_label, metric_fn in (
    ("Precision (macro):", precision_score),
    ("Recall (macro):", recall_score),
    ("F1 Score (macro):", f1_score),
):
    print(metric_label, metric_fn(y_test, ensemble_preds, average="macro"))

# Persist the fitted feature and target encoders next to the saved pipelines
for artifact, filename in (
    (label_encoders, "feature_label_encoders.pkl"),
    (le_target, "target_label_encoder.pkl"),
):
    joblib.dump(artifact, filename)
print("Models and encoders saved successfully.")


Training XGBoost...
XGBoost Performance:
Accuracy: 0.9237239313284941
Precision (macro): 0.8910913137657148
Recall (macro): 0.8858583495439636
F1 Score (macro): 0.8882521402236871

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.79      0.78       117
           1       0.93      0.95      0.94      3490
           2       0.94      0.93      0.93      2461
           3       0.90      0.90      0.90      1851
           4       0.91      0.85      0.88       760

    accuracy                           0.92      8679
   macro avg       0.89      0.89      0.89      8679
weighted avg       0.92      0.92      0.92      8679

Confusion Matrix:
 [[  93    5    7   10    2]
 [   3 3311   74   79   23]
 [   6   88 2296   53   18]
 [  13   94   50 1671   23]
 [   5   44   27   38  646]]

Ensemble Voting Performance:
Accuracy: 0.9237239313284941
Precision (macro): 0.8910913137657148
Recall (macro): 0.8858583495439636
F1 Score (mac