In [10]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Shared Faker instance used to generate synthetic test data until the
# official CSV is available; get_visa_dataset() draws its dates from it.
data_gen = Faker()

# --- STEP 1: DATA COLLECTION ---
# Creating a function to build a dataset that mimics real visa records
def get_visa_dataset(records=1500):
    """Build a synthetic visa-application dataset for testing.

    Every row gets a unique case ID, a random visa class, sector and annual
    wage, an application date requested from the module-level Faker
    instance, a decision date 90-200 days after it, and a status that is
    weighted 85/15 towards 'Certified'.

    Args:
        records: Number of rows to generate. Zero or a negative number
            yields an empty frame.

    Returns:
        A pandas DataFrame with the columns Case_ID, Visa_Class, Sector,
        Annual_Wage, Apply_Date, Decision_Date and Status. The columns are
        present even when the frame is empty.
    """
    columns = [
        'Case_ID',
        'Visa_Class',
        'Sector',
        'Annual_Wage',
        'Apply_Date',
        'Decision_Date',
        'Status',
    ]
    types = ['H1-B', 'L1', 'F1-OPT', 'E2', 'H4-EAD']
    sectors = ['IT', 'Healthcare', 'Finance', 'Education']

    # Draw the ID numbers without replacement: independent randint() calls
    # over 9000 values collide long before 1500 rows, giving duplicate IDs.
    # The pool stays four-digit (1000-9999) unless more rows are requested
    # than it can hold, in which case it is widened just enough.
    row_count = max(0, records)
    id_pool = range(1000, 1000 + max(9000, row_count))
    case_numbers = random.sample(id_pool, row_count)

    visa_list = []
    for case_number in case_numbers:
        # Picking a random date from the last few years
        date_applied = data_gen.date_between(start_date='-3y', end_date='-1y')

        # Adding a random delay for processing (90 to 200 days seems realistic)
        wait_time = random.randint(90, 200)
        date_finished = date_applied + pd.Timedelta(days=wait_time)

        visa_list.append({
            'Case_ID': f"V-{case_number}",
            'Visa_Class': random.choice(types),
            'Sector': random.choice(sectors),
            'Annual_Wage': random.randint(55000, 155000),
            'Apply_Date': date_applied,
            'Decision_Date': date_finished,
            'Status': random.choices(['Certified', 'Denied'], weights=[0.85, 0.15])[0]
        })

    # Passing the column list keeps the schema intact for an empty result.
    return pd.DataFrame(visa_list, columns=columns)

# Build the raw dataframe of synthetic visa records
raw_data = get_visa_dataset(records=1500)


# --- STEP 2: PREPROCESSING ---

# A. Target label: processing time in days
# Both date columns are converted to datetime first, so their difference is a
# timedelta Series and .dt.days turns it into whole days as integers.
for date_col in ('Apply_Date', 'Decision_Date'):
    raw_data[date_col] = pd.to_datetime(raw_data[date_col])
raw_data['Wait_Days'] = raw_data['Decision_Date'].sub(raw_data['Apply_Date']).dt.days

# B. Encoding categorical data
# The text outcome becomes a binary flag: Certified -> 1, Denied -> 0.
raw_data['Status_Numeric'] = raw_data['Status'].map(dict(Certified=1, Denied=0))

# One-hot encode Visa_Class and Sector. drop_first=True leaves out the first
# level of each feature so the dummy columns are not redundant
# (dummy variable trap).
processed_df = pd.get_dummies(
    raw_data,
    columns=['Visa_Class', 'Sector'],
    drop_first=True,
)

# C. Dropping columns that won't help the model
# The case ID carries no signal, the two date columns are already summarized
# by Wait_Days, and the text Status is already encoded as Status_Numeric.
final_data = processed_df.drop(
    columns=['Case_ID', 'Apply_Date', 'Decision_Date', 'Status']
)


# --- STEP 3: VERIFICATION ---
# Per-column count of missing values; every count should be zero
print("Check for Missing Values:", final_data.isna().sum(), sep="\n")

# Shape and first five rows of the model-ready table
print("\nFinal Dataset Shape:", final_data.shape)
final_data.head(n=5)

Check for Missing Values:
Annual_Wage          0
Wait_Days            0
Status_Numeric       0
Visa_Class_F1-OPT    0
Visa_Class_H1-B      0
Visa_Class_H4-EAD    0
Visa_Class_L1        0
Sector_Finance       0
Sector_Healthcare    0
Sector_IT            0
dtype: int64

Final Dataset Shape: (1500, 10)


Unnamed: 0,Annual_Wage,Wait_Days,Status_Numeric,Visa_Class_F1-OPT,Visa_Class_H1-B,Visa_Class_H4-EAD,Visa_Class_L1,Sector_Finance,Sector_Healthcare,Sector_IT
0,71295,191,1,False,False,True,False,False,False,False
1,55667,190,1,False,False,False,True,False,True,False
2,81975,166,1,False,False,True,False,False,True,False
3,114082,123,1,False,True,False,False,False,True,False
4,132965,187,1,False,False,False,True,True,False,False
