In [4]:

# %%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Load dataset
visa_df = pd.read_csv("EasyVisa.csv")  # Update with your dataset path

# Clean data
visa_clean = visa_df.copy()
visa_clean = visa_clean.drop(columns=['case_id', 'requires_job_training'])

# Handle categorical features
continent_counts = visa_clean['continent'].value_counts(normalize=True)
less_freq = continent_counts[continent_counts < 0.05].index
visa_clean['continent'] = visa_clean['continent'].replace(less_freq, 'Others')

# Define features and target
X = visa_clean.drop('case_status', axis=1)
y = visa_clean['case_status'].map({'Certified':1, 'Denied':0})

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preserve feature names
feature_names = X_train.columns.tolist()

# Preprocessing pipeline
categorical_features = ['continent', 'education_of_employee', 'region_of_employment', 'unit_of_wage']
numerical_features = [f for f in feature_names if f not in categorical_features]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Apply preprocessing
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Handle class imbalance
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_processed, y_train)

# %% [markdown]
# ## 2. Model Training & Evaluation

# %%
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import joblib

# Initialize models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=150, learning_rate=0.1, max_depth=5, random_state=42),
    "CatBoost": CatBoostClassifier(iterations=200, depth=6, learning_rate=0.1, verbose=False)
}

# Train and evaluate models
best_score = 0
best_model = None

for name, model in models.items():
    model.fit(X_res, y_res)
    y_pred = model.predict(X_test_processed)
    
    print(f"=== {name} ===")
    print(classification_report(y_test, y_pred))
    
    accuracy = accuracy_score(y_test, y_pred)
    if accuracy > best_score:
        best_score = accuracy
        best_model = model

# Save best model
joblib.dump(best_model, 'best_visa_classifier.pkl')
print(f"\nBest Model: {type(best_model).__name__} with Accuracy: {best_score:.4f}")

# Save preprocessing pipeline
joblib.dump(preprocessor, 'preprocessor.pkl')

# %% [markdown]
# ## 3. Data Drift Monitoring

# %%
from evidently import ColumnMapping
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import DataDriftTab

# Prepare column mapping
column_mapping = ColumnMapping(
    numerical_features=numerical_features,
    categorical_features=categorical_features,
    target='case_status',
    prediction='prediction'
)

# Generate reference and current datasets
reference_data = visa_clean[:2000]
current_data = visa_clean[2000:]

# Create and save drift dashboard
drift_dashboard = Dashboard(tabs=[DataDriftTab()])
drift_dashboard.calculate(reference_data, current_data, column_mapping=column_mapping)
drift_dashboard.save("visa_data_drift.html")

# %% [markdown]
# ## 4. Model Deployment Package

# %%
# Create deployment artifacts
deployment_package = {
    "model": best_model,
    "preprocessor": preprocessor,
    "feature_names": feature_names,
    "column_mapping": column_mapping,
    "model_metadata": {
        "algorithm": type(best_model).__name__,
        "accuracy": best_score,
        "features_used": feature_names,
        "drift_report": "visa_data_drift.html"
    }
}

joblib.dump(deployment_package, 'visa_deployment_package.pkl')


ValueError: could not convert string to float: 'Y'