# **Milestone 3 (Weeks 5–6): Predictive Modeling**

In [2]:
# Mount Google Drive at /content/drive; the dataset path and the model output
# folder used by the later cells both live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Setup & Data Preparation

In [3]:
import pandas as pd
import numpy as np
import joblib  # For saving the models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, accuracy_score, classification_report
import os

# 1. Load Data
input_file = '/content/drive/MyDrive/Datasets/Enhanced_Visa_Dataset.csv'

# Fail fast when the dataset is missing: every later step needs `df`, so merely
# printing a message and carrying on would end in an unrelated NameError below.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Error: {input_file} not found. Please run Milestone 2.")

df = pd.read_csv(input_file)
print(f"Data Loaded. Shape: {df.shape}")

# 2. Preprocessing
# The models need numeric inputs, so the text categories (education, skill
# level) and the text target (case status) are mapped to integer codes.
# One encoder per column is kept so each mapping can be saved and reused later.
# NOTE(review): LabelEncoder is documented for target labels; for input features
# it assigns codes in sorted label order, which imposes an arbitrary ordering.
# Tree models tolerate this, but OrdinalEncoder/OneHotEncoder may be a better fit.
le_education = LabelEncoder()
le_skill = LabelEncoder()
le_status = LabelEncoder()  # For the target variable (CASE_STATUS)

# Replace missing categories with an explicit 'Unknown' level so the encoders
# see a single, consistent string type.
df['MINIMUM_EDUCATION'] = df['MINIMUM_EDUCATION'].fillna('Unknown')
df['PW_SKILL_LEVEL'] = df['PW_SKILL_LEVEL'].fillna('Unknown')

# Encode the categorical features into new *_ENC columns (originals are kept).
df['EDUCATION_ENC'] = le_education.fit_transform(df['MINIMUM_EDUCATION'])
df['SKILL_ENC'] = le_skill.fit_transform(df['PW_SKILL_LEVEL'])

# Encode the classification target. The code assigned to each status depends on
# the labels present in the data, so the mapping is printed for verification.
df['STATUS_ENC'] = le_status.fit_transform(df['CASE_STATUS'])
print(f"Status Mapping: {dict(zip(le_status.classes_, le_status.transform(le_status.classes_)))}")

# 3. Define Features (X)
# Engineered features from Milestone 2 plus the two encoded categoricals.
# NOTE(review): STATE_AVG_WAIT / JOB_GROUP_AVG_WAIT look like aggregates of the
# processing-time target; if they were computed on the full dataset they leak
# test-set information into training -- confirm against Milestone 2.
feature_cols = [
    'PW_WAGE',
    'IS_PEAK_SEASON',
    'STATE_AVG_WAIT',
    'JOB_GROUP_AVG_WAIT',
    'EDUCATION_ENC',
    'SKILL_ENC'
]

X = df[feature_cols]
print("\nFeature Matrix (X) prepared.")
display(X.head())

Data Loaded. Shape: (1171, 16)
Status Mapping: {'Certified': np.int64(0), 'Denied': np.int64(1), 'Withdrawn': np.int64(2)}

Feature Matrix (X) prepared.


Unnamed: 0,PW_WAGE,IS_PEAK_SEASON,STATE_AVG_WAIT,JOB_GROUP_AVG_WAIT,EDUCATION_ENC,SKILL_ENC
0,44490.0,0,749.0625,697.827586,2,0
1,125986.0,1,705.461538,747.386667,1,0
2,119434.0,1,822.5,747.386667,3,3
3,161866.0,1,688.82243,747.386667,1,3
4,83762.0,1,784.271186,747.386667,6,1


## Model 1 - Processing Time Estimator (Regression)

In [4]:
# --- Target: the processing-time column (reported below in days) ---
y_time = df['PROCESSING_TIME_DAYS']

# --- Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable ---
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X,
    y_time,
    test_size=0.2,
    random_state=42,
)

# --- Fit a 100-tree random forest regressor on the training portion ---
print("Training Time Estimator Model... (This may take a moment)")
reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X_train_t, y_train_t)

# --- Score the held-out rows with mean absolute error ---
predictions_t = reg_model.predict(X_test_t)
mae = mean_absolute_error(y_test_t, predictions_t)

# Summary banner: one print call, one line per argument.
print(
    "-" * 30,
    "Model Performance (Regression):",
    f"Mean Absolute Error: {mae:.2f} days",
    "Interpretation: On average, the model's prediction is off by about this many days.",
    "-" * 30,
    sep="\n",
)

# Show the first held-out row as a concrete predicted-vs-actual example.
print(f"Sample Prediction: {predictions_t[0]:.0f} days (Actual: {y_test_t.iloc[0]} days)")

Training Time Estimator Model... (This may take a moment)
------------------------------
Model Performance (Regression):
Mean Absolute Error: 54.42 days
Interpretation: On average, the model's prediction is off by about this many days.
------------------------------
Sample Prediction: 923 days (Actual: 927 days)


## Model 2 - Visa Status Prediction (Classification)

In [5]:
# 1. Prepare Targets for Classification
# STATUS_ENC holds the integer codes produced by le_status for CASE_STATUS.
y_status = df['STATUS_ENC']

# 2. Split Data (80% train / 20% test, fixed seed for a repeatable split)
# NOTE(review): the classes are imbalanced; consider stratify=y_status so every
# class keeps the same share in both splits. Left unchanged here so the
# recorded metrics stay comparable.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_status, test_size=0.2, random_state=42)

# 3. Train Random Forest Classifier
print("Training Status Prediction Model...")
class_model = RandomForestClassifier(n_estimators=100, random_state=42)
class_model.fit(X_train_c, y_train_c)

# 4. Evaluate on the held-out rows
predictions_c = class_model.predict(X_test_c)
accuracy = accuracy_score(y_test_c, predictions_c)

print("-" * 30)
print("Model Performance (Classification):")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nDetailed Report:")
# Pass the full label set explicitly. With target_names alone, the report raises
# ValueError whenever a class is absent from both y_test_c and predictions_c,
# because the inferred labels no longer match the number of names. transform()
# on classes_ yields the codes in the same order as the names.
print(classification_report(
    y_test_c,
    predictions_c,
    labels=le_status.transform(le_status.classes_),
    target_names=le_status.classes_,
))
print("-" * 30)

Training Status Prediction Model...
------------------------------
Model Performance (Classification):
Accuracy: 78.72%

Detailed Report:
              precision    recall  f1-score   support

   Certified       0.71      0.56      0.63        75
      Denied       0.80      0.89      0.84       150
   Withdrawn       1.00      1.00      1.00        10

    accuracy                           0.79       235
   macro avg       0.84      0.82      0.82       235
weighted avg       0.78      0.79      0.78       235

------------------------------


## Save Models for Deployment

In [6]:
# Make sure the destination folder on Google Drive exists (no error if it already does).
model_dir = '/content/drive/MyDrive/Models'
os.makedirs(model_dir, exist_ok=True)

# Everything to persist, keyed by output file name. The two fitted models come
# first, followed by the three encoders; the encoders are saved alongside the
# models so the same category <-> integer mappings can be reused later.
artifacts = {
    'visa_time_model.pkl': reg_model,
    'visa_status_model.pkl': class_model,
    'education_encoder.pkl': le_education,
    'skill_encoder.pkl': le_skill,
    'status_encoder.pkl': le_status,
}

# Write each artifact with joblib, in the order listed above.
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, f'{model_dir}/{artifact_name}')

print("Success! Models and Encoders saved to Google Drive.")
print(f"Location: {model_dir}")

Success! Models and Encoders saved to Google Drive.
Location: /content/drive/MyDrive/Models
