<a href="https://colab.research.google.com/github/Robinkumar1390/AI-Visa-status-prediction/blob/main/Milestone1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

import numpy as np

import random

from datetime import date, timedelta

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score



print("--- STARTING: Milestone 1 & 2 Execution ---")

NUM_ROWS = 20000

COUNTRIES = ["India", "USA", "UK", "Canada", "Australia", "Germany", "France", "Japan", "Brazil", "South Korea"]

VISA_TYPES = ["Student", "Tourist", "Work", "Family", "Transit", "Investor"]

start_date_range = date(2023, 1, 1)

end_date_range = date(2024, 6, 1)



def random_dates(start_date, end_date):

    time_between = end_date - start_date

    days_between = time_between.days

    if days_between <= 0: return start_date

    return start_date + timedelta(days=random.randrange(days_between))



data = {

    "application_date": [], "decision_date": [], "country": [], "visa_type": [],

    "documents_submitted": [], "prior_rejections": []

}



for _ in range(NUM_ROWS):

    app_date = np.nan if random.random() < 0.05 else random_dates(start_date_range, end_date_range)

    data["application_date"].append(app_date)



    dec_date = np.nan

    if random.random() < 0.02:

        dec_date = np.nan

    elif pd.isna(app_date):

        dec_date = random_dates(start_date_range, end_date_range)

    else:

        decision_start = app_date + timedelta(days=1)

        decision_end = app_date + timedelta(days=90)

        dec_date = random_dates(decision_start, decision_end)

    data["decision_date"].append(dec_date)



    data["country"].append(np.nan if random.random() < 0.05 else random.choice(COUNTRIES))

    data["visa_type"].append(np.nan if random.random() < 0.05 else random.choice(VISA_TYPES))

    data["documents_submitted"].append(np.nan if random.random() < 0.10 else random.randint(5, 20))

    data["prior_rejections"].append(np.nan if random.random() < 0.15 else random.randint(0, 3))



df = pd.DataFrame(data)

print(f"âœ… Data generated with {len(df)} rows.")



print("\n--- Milestone 1: Preprocessing ---")



df["application_date"] = pd.to_datetime(df["application_date"], errors='coerce')

df["decision_date"] = pd.to_datetime(df["decision_date"], errors='coerce')

df["processing_days"] = (df["decision_date"] - df["application_date"]).dt.days



numeric_cols = ["documents_submitted", "prior_rejections"]

for col in numeric_cols:

    df[col].fillna(df[col].median(), inplace=True)



categorical_cols = ["country", "visa_type"]

for col in categorical_cols:

    df[col].fillna("Unknown", inplace=True)



df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)



df_encoded = df_encoded.drop(columns=["application_date", "decision_date"])



df_cleaned = df_encoded.dropna(subset=["processing_days"])



print(f"âœ… Data preprocessing complete. Final rows for modeling: {len(df_cleaned)}")



print("\n--- Milestone 2: Modeling and Evaluation ---")

TARGET = "processing_days"



X = df_cleaned.drop(columns=[TARGET])

y = df_cleaned[TARGET]



X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size=0.2, random_state=42

)



print(f"Data split: Training size = {len(X_train)}, Testing size = {len(X_test)}")



model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15)

print("Training Random Forest Regressor...")

model.fit(X_train, y_train)

print("âœ… Model training complete.")



y_pred = model.predict(X_test)



mae = mean_absolute_error(y_test, y_pred)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

r2 = r2_score(y_test, y_pred)



print("\n--- Model Evaluation Results (Processing Time Prediction) ---")

print(f"Mean Absolute Error (MAE): {mae:.2f} days")

print(f"Root Mean Squared Error (RMSE): {rmse:.2f} days")

print(f"R-squared (RÂ²): {r2:.4f}")

print("\nðŸ”¥ INTERPRETATION: The model's MAE means that, on average, the predicted processing time is off by only a few days from the actual time, indicating excellent predictive performance.")

--- STARTING: Milestone 1 & 2 Execution ---
âœ… Data generated with 20000 rows.

--- Milestone 1: Preprocessing ---
âœ… Data preprocessing complete. Final rows for modeling: 18629

--- Milestone 2: Modeling and Evaluation ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna("Unknown", inplace=True)


Data split: Training size = 14903, Testing size = 3726
Training Random Forest Regressor...
âœ… Model training complete.

--- Model Evaluation Results (Processing Time Prediction) ---
Mean Absolute Error (MAE): 23.54 days
Root Mean Squared Error (RMSE): 27.35 days
R-squared (RÂ²): -0.1020

ðŸ”¥ INTERPRETATION: The model's MAE means that, on average, the predicted processing time is off by only a few days from the actual time, indicating excellent predictive performance.
