In [137]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

## Milestone 1 - Data Collection & Preprocessing

In [65]:
# Read the raw visa application records (relative path, resolved against the
# notebook's working directory).
DATASET_PATH = "visa_dataset.csv"
df = pd.read_csv(DATASET_PATH)

In [66]:
# (rows, columns) of the freshly loaded frame
df.shape

(50000, 11)

In [67]:
# Display the loaded frame for a visual check of its columns and missing values
df

Unnamed: 0,Application Date,Decision Date,Visa Type,Applicant Nationality,Processing Center,Season,Application Complexity,Document Completeness,Expedited Request,Processing Time (Days),Visa Status
0,2024-01-21,2024-03-10,Student,Brazil,Paris,Peak,0.0,,0.0,49,Approved
1,2024-10-20,2024-11-01,Business,Australia,Beijing,Off-Peak,0.0,1.0,1.0,12,Approved
2,2024-07-06,2024-08-27,Business,India,Delhi,Peak,0.0,0.0,1.0,52,Approved
3,2024-11-24,2025-01-27,Business,India,Beijing,Peak,0.0,0.0,0.0,64,Approved
4,2024-09-20,2024-12-18,Work,France,Mumbai,Off-Peak,1.0,0.0,1.0,89,Approved
...,...,...,...,...,...,...,...,...,...,...,...
49995,2024-07-04,2024-08-07,Business,Brazil,Sydney,Off-Peak,0.0,0.0,1.0,34,Approved
49996,2024-07-05,2024-08-11,Business,Australia,Mumbai,Peak,0.0,1.0,0.0,37,Approved
49997,2024-11-28,2025-02-24,Work,China,Berlin,Peak,,1.0,0.0,88,Approved
49998,2024-04-03,2024-04-22,Tourist,Australia,Berlin,Off-Peak,,0.0,1.0,19,Approved


### Date transformation

In [68]:
# Parse both date columns. errors="coerce" turns unparseable values into NaT
# instead of raising, so every step below must tolerate missing dates.
df["Application Date"] = pd.to_datetime(df["Application Date"], errors="coerce")
df["Decision Date"] = pd.to_datetime(df["Decision Date"], errors="coerce")


def _week_of_year(dates):
    """Return the ISO week number for each date in a datetime Series.

    The integer dtype is kept when every date parsed. When some dates are NaT the
    result is float64 with NaN, because a plain ``astype(int)`` raises on missing
    values.
    """
    week = dates.dt.isocalendar().week
    return week.astype("float64") if week.isna().any() else week.astype(int)


# Extract calendar features for each date column.
# NOTE(review): in the sample rows shown, "Processing Time (Days)" equals
# Decision Date - Application Date. The Decision_* features therefore leak the
# regression target and are presumably unknown when a new application arrives.
# Confirm whether they should be used as model inputs at all.
for prefix, source in (("Application", "Application Date"), ("Decision", "Decision Date")):
    dates = df[source]
    df[f"{prefix}_Month"] = dates.dt.month
    df[f"{prefix}_DayOfWeek"] = dates.dt.dayofweek
    df[f"{prefix}_WeekOfYear"] = _week_of_year(dates)

# Drop the original date columns now that the derived features exist
df = df.drop(columns=["Application Date", "Decision Date"])

In [69]:
# Display the frame after the date columns were replaced by derived features
df

Unnamed: 0,Visa Type,Applicant Nationality,Processing Center,Season,Application Complexity,Document Completeness,Expedited Request,Processing Time (Days),Visa Status,Application_Month,Application_DayOfWeek,Application_WeekOfYear,Decision_Month,Decision_DayOfWeek,Decision_WeekOfYear
0,Student,Brazil,Paris,Peak,0.0,,0.0,49,Approved,1,6,3,3,6,10
1,Business,Australia,Beijing,Off-Peak,0.0,1.0,1.0,12,Approved,10,6,42,11,4,44
2,Business,India,Delhi,Peak,0.0,0.0,1.0,52,Approved,7,5,27,8,1,35
3,Business,India,Beijing,Peak,0.0,0.0,0.0,64,Approved,11,6,47,1,0,5
4,Work,France,Mumbai,Off-Peak,1.0,0.0,1.0,89,Approved,9,4,38,12,2,51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,Business,Brazil,Sydney,Off-Peak,0.0,0.0,1.0,34,Approved,7,3,27,8,2,32
49996,Business,Australia,Mumbai,Peak,0.0,1.0,0.0,37,Approved,7,4,27,8,6,32
49997,Work,China,Berlin,Peak,,1.0,0.0,88,Approved,11,3,48,2,0,9
49998,Tourist,Australia,Berlin,Off-Peak,,0.0,1.0,19,Approved,4,2,14,4,0,17


### Defining Target Variables

In [70]:
# Two targets are taken from the same frame:
#   y_reg - processing time in days (regression target)
#   y_clf - visa status (classification target)
y_reg, y_clf = df["Processing Time (Days)"], df["Visa Status"]

In [71]:
# Feature matrix: every column except the two target columns
target_columns = ["Processing Time (Days)", "Visa Status"]
X = df.drop(columns=target_columns)

### Identify Column Types

In [89]:
# Split the feature columns by dtype: string-like (object) columns are treated
# as categorical, everything numeric goes through the numeric pipeline.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
# "number" matches every numeric width. An ["int64", "float64"] filter skips
# int32 columns such as Application_Month (produced by the .dt accessors).
numeric_cols = X.select_dtypes(include="number").columns.tolist()

In [108]:
# Explicit list of the numeric feature columns; its order fixes the order of the
# scaled columns in the preprocessor output.
# NOTE(review): the Decision_* entries are derived from the decision date, which
# presumably is not known before a visa is processed - confirm they are not
# leaking the processing-time target.
numeric_cols = [
    "Application_Month",
    "Application_DayOfWeek",
    "Application_WeekOfYear",
    "Decision_Month",
    "Decision_DayOfWeek",
    "Decision_WeekOfYear",
    "Application Complexity",
    "Document Completeness",
    "Expedited Request"
]

# Force a numeric dtype (non-numeric values become NaN and are imputed later).
# X was created by df.drop(...), which returns a new DataFrame, so converting df
# alone never reaches the frame the pipeline consumes; coerce both.
for frame in (df, X):
    for col in numeric_cols:
        frame[col] = pd.to_numeric(frame[col], errors="coerce")


In [109]:
# Show which columns go to which preprocessing branch
for label, columns in (
    ("Categorical Columns: ", categorical_cols),
    ("Numeric Columns: ", numeric_cols),
):
    print(label, columns)

Categorical Columns:  ['Visa Type', 'Applicant Nationality', 'Processing Center', 'Season']
Numeric Columns:  ['Application_Month', 'Application_DayOfWeek', 'Application_WeekOfYear', 'Decision_Month', 'Decision_DayOfWeek', 'Decision_WeekOfYear', 'Application Complexity', 'Document Completeness', 'Expedited Request']


### Preprocessing Pipeline

In [123]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [124]:
# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown="ignore" makes categories unseen during fit
# encode as all zeros instead of raising at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)


In [125]:
# Route each column group to its own branch; the output holds the numeric block
# first and the one-hot block second, in the order the branches are listed.
column_branches = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

In [126]:
# 80/20 train/test split, once per target. Both calls share the same settings
# (including the seed), so the feature rows are partitioned the same way.
split_options = dict(test_size=0.2, random_state=42)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X, y_reg, **split_options
)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X, y_clf, **split_options
)

In [127]:
# Fit the preprocessor on the training features only, so imputation values,
# scaling statistics and category lists are learned without seeing test rows.
# The same fitted object is reused for the classification split.
preprocessor.fit(X_train_reg)

In [128]:
# Apply the already-fitted preprocessor to every split (no refitting here)
X_train_reg_processed, X_test_reg_processed = (
    preprocessor.transform(part) for part in (X_train_reg, X_test_reg)
)
X_train_clf_processed, X_test_clf_processed = (
    preprocessor.transform(part) for part in (X_train_clf, X_test_clf)
)

### Saving the fitted preprocessor

In [129]:
import joblib

# Persist the fitted preprocessor so later stages can reuse the exact same
# imputation, scaling and encoding.
PREPROCESSOR_PATH = "visa_preprocessor.pkl"
joblib.dump(preprocessor, PREPROCESSOR_PATH)

print("Preprocessing completed successfully.")
for label, matrix in (
    ("Processed regression shape:", X_train_reg_processed),
    ("Processed classification shape:", X_train_clf_processed),
):
    print(label, matrix.shape)

Preprocessing completed successfully.
Processed regression shape: (40000, 30)
Processed classification shape: (40000, 30)


### Checking that preprocessing succeeded

In [131]:
# The first rows should contain only numbers: scaled numeric values followed by
# 0/1 one-hot indicators.
first_rows = X_train_reg_processed[:5]
print(first_rows)

[[-0.72201138 -1.48680634 -0.68917466 -0.15276596 -1.50140797 -0.23149611
   1.11221012 -1.11361725  0.89856522  1.          0.          0.
   0.          0.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          1.          0.          1.        ]
 [-0.14044179  1.00499218 -0.02342121  0.42914417 -1.00098869  0.50271293
  -0.89911068 -1.11361725  0.89856522  0.          1.          0.
   0.          0.          0.          1.          0.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          1.        ]
 [ 0.7319126   1.50335189  0.77548294  1.59296442 -1.00098869  1.63739963
   1.11221012 -1.11361725 -1.11288527  0.          1.          0.
   0.          0.          0.          0.          0.          0.
   0.          1.          0.          0.          0.          1.
   0.          0.          0.     

In [132]:
# Compare the column count before and after preprocessing
n_features_before = X_train_reg.shape[1]
n_features_after = X_train_reg_processed.shape[1]
print("Before:", n_features_before)
print("After:", n_features_after)

Before: 13
After: 30


Before: 13 features

After: 30 features (9 scaled numeric columns + 21 one-hot columns, because one-hot encoding expands the 4 categorical columns into one column per category)

In [133]:
# Pull the fitted one-hot encoder out of the categorical branch and list the
# output column names it generates.
cat_pipeline = preprocessor.named_transformers_["cat"]
ohe = cat_pipeline.named_steps["encoder"]
print(ohe.get_feature_names_out())


['x0_Business' 'x0_Student' 'x0_Tourist' 'x0_Work' 'x1_Australia'
 'x1_Brazil' 'x1_China' 'x1_France' 'x1_Germany' 'x1_India' 'x1_UK'
 'x1_USA' 'x2_Beijing' 'x2_Berlin' 'x2_Delhi' 'x2_London' 'x2_Mumbai'
 'x2_Paris' 'x2_Sydney' 'x3_Off-Peak' 'x3_Peak']


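### Names are of the form `x<i>_<category>`

The prefix `x0`…`x3` is the position of the column in `categorical_cols` (Visa Type, Applicant Nationality, Processing Center, Season), for example:

`x0_Student` (Visa Type = Student)

`x0_Tourist` (Visa Type = Tourist)

`x1_India` (Applicant Nationality = India)

`x2_Berlin` (Processing Center = Berlin)

`x3_Peak` (Season = Peak)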

### Hence, encoding is done right

In [135]:
# Count the NaNs left after imputation; the expected total is 0
processed_frame = pd.DataFrame(X_train_reg_processed)
remaining_missing = processed_frame.isna().sum().sum()
print(remaining_missing)

0


In [136]:
# Check that scaling was applied: compare raw values with their scaled versions
print(X_train_reg["Application_Month"].head())

# The numeric block comes first in the preprocessor output, so a column's
# position in numeric_cols is also its position in the processed matrix.
numeric_col_index = numeric_cols.index("Application_Month")
print("Index inside numeric pipeline:", numeric_col_index)
print(X_train_reg_processed[:5, numeric_col_index])

# Five values alone cannot show standardization, so also report the statistics
# of the whole training column (expected: mean close to 0, std close to 1).
# Assumes the processed matrix is a dense array, as in the recorded output.
scaled_column = X_train_reg_processed[:, numeric_col_index]
print("Mean of scaled column:", f"{scaled_column.mean():.6f}")
print("Std of scaled column:", f"{scaled_column.std():.6f}")


39087     4
30893     6
45278     9
16398    11
13653     1
Name: Application_Month, dtype: int32
Index inside numeric pipeline: 0
[-0.72201138 -0.14044179  0.7319126   1.31348219 -1.59436577]


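After `StandardScaler`, each numeric column of the training set has mean ≈ 0 and standard deviation ≈ 1.

The raw month numbers (1–12) have been replaced by standardized values, which confirms that scaling is applied.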

### Preprocessing completed and checked Successfully