## Milestone 3 - Predictive Modeling



In [24]:
#Importing libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import joblib

In [25]:
# Loading feature engineered dataset created on Milestone 2
df = pd.read_csv("visa_dataset_feature_engineered.csv")

# Define target and features
y = df["Processing Time (Days)"]
X = df.drop(columns=["Processing Time (Days)", "Visa Status"])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [26]:
# Loading preprocessor from Milestone 1
preprocessor = joblib.load("visa_preprocessor.pkl")

# Transform data
X_train_trans = preprocessor.transform(X_train)
X_test_trans = preprocessor.transform(X_test)

In [27]:
# Baseline Model
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

In [28]:
results = {}
def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    results[name] = [mae, rmse, r2]
    print(f"\n{name}")
    print("MAE :", mae)
    print("RMSE:", rmse)
    print("R² Score:", r2)

In [29]:
# Train and evaluate baseline models
for name, model in models.items():
    evaluate_model(name, model, X_train_trans, y_train, X_test_trans, y_test)


Linear Regression
MAE : 4.242649988501375
RMSE: 6.312244619290182
R² Score: 0.9233622847108139

Random Forest
MAE : 4.382071
RMSE: 6.429956412760509
R² Score: 0.9204773277610148

Gradient Boosting
MAE : 4.362385629867506
RMSE: 6.372905871784863
R² Score: 0.9218822159082087


### Model Selection Summary

Based on the evaluation metrics, **Linear Regression is the best-performing model**.

#### Performance Comparison
| Model | MAE (↓) | RMSE (↓) | R² (↑) |
|-------|---------|-----------|---------|
| **Linear Regression** | **4.24** | **6.31** | **0.92336** |
| Gradient Boosting | 4.36 | 6.37 | 0.92188 |
| Random Forest | 4.38 | 6.43 | 0.92048 |

#### Why Linear Regression Wins
- Lowest **MAE** → most accurate on average.  
- Lowest **RMSE** → fewer large errors.  
- Highest **R²** → explains the most variance in processing time.  
- Features created during preprocessing show **strong linear relationships**, making linear regression ideal.  
- Ensemble models slightly overfit, while linear regression generalizes better.

#### Conclusion
**Linear Regression is selected as the final model for deployment** due to its superior accuracy, stability, and interpretability.


Tuning RandomForest using RandomizedSearchCV





In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

rf = RandomForestRegressor()

param_dist = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=6,
    cv=2,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2
)

# USE TRANSFORMED DATA
search.fit(X_train_trans, y_train)

print("Best Parameters:", search.best_params_)
print("Best MAE:", -search.best_score_)


Fitting 2 folds for each of 6 candidates, totalling 12 fits
Best Parameters: {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 15}
Best MAE: 4.8799297534944035


Tuning GradientBoost using RandomizedSearchCV





In [31]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_absolute_error
import numpy as np

# Use transformed data
X_train_gb = X_train_trans
X_test_gb = X_test_trans

gb = GradientBoostingRegressor(random_state=42)

param_dist_gb = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

gb_search = RandomizedSearchCV(
    estimator=gb,
    param_distributions=param_dist_gb,
    n_iter=8,
    cv=2,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

gb_search.fit(X_train_gb, y_train)

print("Best Gradient Boosting Parameters:", gb_search.best_params_)
print("Best MAE:", -gb_search.best_score_)

# Evaluate on test set
gb_best = gb_search.best_estimator_
gb_preds = gb_best.predict(X_test_gb)

test_mae = mean_absolute_error(y_test, gb_preds)
test_rmse = np.sqrt(np.mean((y_test - gb_preds)**2))
test_r2 = gb_best.score(X_test_gb, y_test)

print("\n===== Tuned Gradient Boosting Test Performance =====")
print("MAE :", test_mae)
print("RMSE:", test_rmse)
print("R² Score:", test_r2)


Fitting 2 folds for each of 8 candidates, totalling 16 fits
Best Gradient Boosting Parameters: {'n_estimators': 150, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 2, 'learning_rate': 0.1}
Best MAE: 4.421893233679947

===== Tuned Gradient Boosting Test Performance =====
MAE : 4.393291260650672
RMSE: 6.389805366296463
R² Score: 0.9214673654584936


Based on both baseline and hyperparameter-tuned results, Linear Regression is the best-performing model for the visa processing time prediction task.


• It achieves the lowest MAE (4.24), lowest RMSE (6.31), and the highest R² score (0.9233).  
• Random Forest performance worsened after tuning (MAE ≈ 4.85), indicating that tree-based methods are not well-suited to this dataset.  
• Tuned Gradient Boosting improved but still did not outperform Linear Regression (MAE ≈ 4.39, R² ≈ 0.9214).  


These results show that the dataset relationships are mostly linear, and adding model complexity does not improve performance.  


Thus, **Linear Regression is selected as the final model for deployment.**


In [32]:
# Select best model
best_model_name = min(results, key=results.get)
best_model = models[best_model_name]

print("\nBest Model Based on MAE:", best_model_name)


Best Model Based on MAE: Linear Regression


In [33]:
joblib.dump(best_model, "best_regression_model.pkl")
print("\nSaved: best_regression_model.pkl")



Saved: best_regression_model.pkl


### Predicting in the test set

In [35]:
# Take 1 sample from test set
sample = X_test.iloc[[0]]
sample_trans = preprocessor.transform(sample)

prediction = best_model.predict(sample_trans)[0]
prediction = max(0, prediction)

print("\nSample Prediction (Days):", prediction)


Sample Prediction (Days): 16.80235368849185


### Predicting by giving manual input

In [45]:
# HELPER FUNCTIONS

def compute_date_features(date_str):
    d = pd.to_datetime(date_str)
    return d.month, d.weekday(), d.isocalendar().week

def get_season_index(season):
    mapping = {"Low": 1, "Mid": 2, "Off-Peak": 3, "Peak": 4}
    return mapping.get(season, 2)    # default mid

def get_country_avg(country):
    return df[df["Applicant Nationality"] == country]["Processing Time (Days)"].mean()

def get_visa_avg(vtype):
    return df[df["Visa Type"] == vtype]["Processing Time (Days)"].mean()

def get_center_load(center):
    return len(df[df["Processing Center"] == center])

In [42]:
# Prediction function
def predict_processing_time(
    application_date,
    decision_date,
    visa_type,
    nationality,
    center,
    season,
    complexity,
    completeness,
    expedited
):

    # Generate date-related features
    app_month, app_day, app_week = compute_date_features(application_date)
    dec_month, dec_day, dec_week = compute_date_features(decision_date)

    # Generate engineered features
    season_idx = get_season_index(season)
    country_avg = get_country_avg(nationality)
    visa_avg = get_visa_avg(visa_type)
    center_load = get_center_load(center)

    # Build final row for prediction
    input_df = pd.DataFrame([{
        "Application Date": application_date,
        "Decision Date": decision_date,
        "Visa Type": visa_type,
        "Applicant Nationality": nationality,
        "Processing Center": center,
        "Season": season,
        "Application Complexity": complexity,
        "Document Completeness": completeness,
        "Expedited Request": expedited,
        "Application_Month": app_month,
        "Application_DayOfWeek": app_day,
        "Application_WeekOfYear": app_week,
        "Decision_Month": dec_month,
        "Decision_DayOfWeek": dec_day,
        "Decision_WeekOfYear": dec_week,
        "Season_Index": season_idx,
        "Country_Avg_Processing": country_avg,
        "VisaType_Avg_Processing": visa_avg,
        "Center_Load": center_load
    }])

    # Transform with preprocessor
    transformed = preprocessor.transform(input_df)

    # Predict
    prediction = best_model.predict(transformed)[0]
    return max(0, prediction)

In [44]:
#Sample Prediction
result = predict_processing_time(
    application_date="2024-01-21",
    decision_date="2024-03-10",
    visa_type="Business",
    nationality="India",
    center="Delhi",
    season="Peak",
    complexity=0,
    completeness=1,
    expedited=0
)

print("Predicted Processing Time (Days):", result)


Predicted Processing Time (Days): 44.10037460740214
