## Import libraries

In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Loading and Overview of the Dataset

In [2]:
## Load the raw EasyVisa CSV once and keep it untouched in `data`.
data = pd.read_csv("Dataset/EasyVisa.csv")
## Work on a copy, so later cells that reassign or modify `df` leave `data` as loaded.
df = data.copy()

In [3]:
df.head()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,EZYV01,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,Denied
1,EZYV02,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,Certified
2,EZYV03,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,Denied
3,EZYV04,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,Denied
4,EZYV05,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,Certified


In [4]:
df.tail()

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
25475,EZYV25476,Asia,Bachelor's,Y,Y,2601,2008,South,77092.57,Year,Y,Certified
25476,EZYV25477,Asia,High School,Y,N,3274,2006,Northeast,279174.79,Year,Y,Certified
25477,EZYV25478,Asia,Master's,Y,N,1121,1910,South,146298.85,Year,N,Certified
25478,EZYV25479,Asia,Master's,Y,Y,1918,1887,West,86154.77,Year,Y,Certified
25479,EZYV25480,Asia,Bachelor's,Y,N,3195,1960,Midwest,70876.91,Year,Y,Certified


In [5]:
## Report the size of the working frame (rows first, then columns).
row_count, column_count = df.shape
print(f'total number of rows: {row_count} => total number of columns: {column_count}')

total number of rows: 25480 => total number of columns: 12


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [7]:
df.dtypes

case_id                   object
continent                 object
education_of_employee     object
has_job_experience        object
requires_job_training     object
no_of_employees            int64
yr_of_estab                int64
region_of_employment      object
prevailing_wage          float64
unit_of_wage              object
full_time_position        object
case_status               object
dtype: object

In [8]:
df.isnull().sum()

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

In [9]:
## Print the distinct values of every column, one array per column.
for column_name in df.columns:
    print(df[column_name].unique())

['EZYV01' 'EZYV02' 'EZYV03' ... 'EZYV25478' 'EZYV25479' 'EZYV25480']
['Asia' 'Africa' 'North America' 'Europe' 'South America' 'Oceania']
['High School' "Master's" "Bachelor's" 'Doctorate']
['N' 'Y']
['N' 'Y']
[14513  2412 44444 ... 24335 48785 40224]
[2007 2002 2008 1897 2005 2012 1994 1924 1995 2004 1963 2006 1987 1991
 2001 1972 2013 1968 1884 1981 1997 2009 1998 1880 2000 2010 1965 1909
 2011 1989 1933 1960 2003 1976 1996 1847 1935 1890 1999 1838 1947 1939
 1970 1977 1982 1943 1956 1864 1974 1985 1984 1971 1913 1969 1818 1839
 1914 1988 1944 1855 1975 1966 1801 1920 1925 1993 1992 1979 1986 1931
 1962 1954 1868 1859 1946 1950 1869 1917 2014 1980 1896 1949 1843 1850
 1906 1961 1951 1958 1912 1983 1948 1945 1978 1898 1923 1911 1851 1865
 1849 1872 1967 1926 1873 1848 2015 1889 1876 1852 1938 1973 1959 1927
 1990 1930 1922 1940 1878 1934 1834 1928 1800 1861 1952 1846 1885 1907
 1817 1841 1821 1953 1888 1916 1886 2016 1932 1915 1937 1866 1919 1921
 1910 1854 1875 1904 1879 1942 1964 18

## Data Preprocessing - Step 1
- Drop the case_id column
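- Change the column type for "has_job_experience", "requires_job_training", "full_time_position" and "case_status" to category (currently disabled: the conversion cells below are commented out, so these columns remain `object`)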

In [10]:
## Remove the case_id column, as listed in the preprocessing steps.
df = df.drop(columns=['case_id'])

In [11]:
## convert the has_job_experience column type to category
## df['has_job_experience'] = df['has_job_experience'].astype('category')

In [12]:
## convert the requires_job_training column type to category
## df['requires_job_training'] = df['requires_job_training'].astype('category')

In [13]:
## convert the full_time_position column type to category
## df['full_time_position'] = df['full_time_position'].astype('category')

In [14]:
## convert the case_status column type to category
## df['case_status'] = df['case_status'].astype('category')

In [15]:
df.dtypes

continent                 object
education_of_employee     object
has_job_experience        object
requires_job_training     object
no_of_employees            int64
yr_of_estab                int64
region_of_employment      object
prevailing_wage          float64
unit_of_wage              object
full_time_position        object
case_status               object
dtype: object

In [16]:
df.case_status.value_counts()

case_status
Certified    17018
Denied        8462
Name: count, dtype: int64

In [17]:
df.case_status.head()

0       Denied
1    Certified
2       Denied
3       Denied
4    Certified
Name: case_status, dtype: object

In [18]:
## Encode the target as 1 for 'Certified' and 0 otherwise.
## The labels are read from the untouched raw frame `data` (same row index as `df`)
## rather than from `df` itself, so re-running this cell gives the same result;
## comparing the already-encoded integers to 'Certified' would turn every label into 0.
df['case_status'] = data['case_status'].eq('Certified').astype(int)

In [19]:
df.case_status.head()

0    0
1    1
2    0
3    0
4    1
Name: case_status, dtype: int32

In [20]:
df.head()

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,High School,N,N,14513,2007,West,592.2029,Hour,Y,0
1,Asia,Master's,Y,N,2412,2002,Northeast,83425.65,Year,Y,1
2,Asia,Bachelor's,N,Y,44444,2008,West,122996.86,Year,Y,0
3,Asia,Bachelor's,N,N,98,1897,West,83434.03,Year,Y,0
4,Africa,Master's,Y,N,1082,2005,South,149907.39,Year,Y,1


## Exploratory Data Analysis

In [21]:
## target variable 
df.case_status.value_counts()

case_status
1    17018
0     8462
Name: count, dtype: int64

In [22]:
## Correlate every non-object column (the binary target included) with each other,
## then show how each one relates to case_status.
numeric_cols = df.select_dtypes(exclude='object')
corr_matrix = numeric_cols.corr(method='pearson')
corr_matrix.loc[:, 'case_status']

no_of_employees    0.008677
yr_of_estab        0.008597
prevailing_wage    0.076198
case_status        1.000000
Name: case_status, dtype: float64

## Build a Validation Framework


In [23]:
## Two-stage split: hold out 20% of all rows for testing, then take 25% of the
## remaining 80% for validation (0.25 * 0.80 = 20% of the full data),
## which leaves 60% for training. random_state is fixed so the split is repeatable.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)

for split_name, split_frame in (
    ('Training', df_train),
    ('Validation', df_valid),
    ('Test', df_test),
):
    print(f'{split_name} dataset: {len(split_frame)}')

Training dataset: 15288
Validation dataset: 5096
Test dataset: 5096


In [24]:
## Pull the target out of each split as a NumPy array (train, validation, test order).
y_train, y_valid, y_test = (
    split['case_status'].to_numpy() for split in (df_train, df_valid, df_test)
)

In [25]:
## Remove the target from every feature frame so it cannot be used as an input.
for split_frame in (df_train, df_valid, df_test):
    del split_frame['case_status']

In [28]:
df.dtypes

continent                 object
education_of_employee     object
has_job_experience        object
requires_job_training     object
no_of_employees            int64
yr_of_estab                int64
region_of_employment      object
prevailing_wage          float64
unit_of_wage              object
full_time_position        object
case_status                int32
dtype: object

## Feature Engineering 
- Divide the features into numerical and categorical groups
- Perform one-hot encoding of the categorical features

In [29]:
## Number-valued input columns.
numerical_features = [
    'no_of_employees',
    'yr_of_estab',
    'prevailing_wage',
]

## String-valued input columns (the ones to be one-hot encoded).
categorical_features = [
    'continent',
    'education_of_employee',
    'has_job_experience',
    'requires_job_training',
    'region_of_employment',
    'unit_of_wage',
    'full_time_position',
]

In [30]:
## Turn each split into a list of {feature: value} dicts, one dict per row,
## using the categorical columns followed by the numerical ones.
feature_columns = categorical_features + numerical_features
train_dict = df_train[feature_columns].to_dict(orient='records')
valid_dict = df_valid[feature_columns].to_dict(orient='records')

In [31]:
## sparse=False asks the vectorizer for dense arrays instead of a sparse matrix.
dv = DictVectorizer(sparse=False)
## Fit on the training records only; the validation records are transformed with this same fitted vectorizer.
dv.fit(train_dict)

In [32]:
## Encode both splits with the vectorizer fitted on the training records.
X_train, X_valid = (
    dv.transform(records) for records in (train_dict, valid_dict)
)

## Training The Model

In [33]:
## Logistic regression with the liblinear solver; random_state is fixed for repeatable fits.
## NOTE(review): the numerical features go in unscaled (no scaling step appears before this cell) —
## consider standardizing them and re-checking the validation score.
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [34]:
## Class probabilities for every validation row (two columns, as displayed in the next cell).
y_valid_pred = model.predict_proba(X_valid)

In [35]:
y_valid_pred

array([[0.33966122, 0.66033878],
       [0.37088715, 0.62911285],
       [0.38524135, 0.61475865],
       ...,
       [0.28108691, 0.71891309],
       [0.38740448, 0.61259552],
       [0.28483983, 0.71516017]])

In [36]:
## Keep only column index 1 — the second class, i.e. label 1 (Certified) under the 0/1 target encoding.
y_valid_pred = model.predict_proba(X_valid)[:, 1]

In [37]:
## Hard decision per validation row: True when the probability reaches the 0.5 cut-off.
case_status = y_valid_pred >= 0.5

In [38]:
(y_valid == case_status).mean()

0.6660125588697017

In [39]:
## Accuracy of the thresholded validation predictions against the true labels.
## NOTE(review): the 66.6% printed below is close to the share of Certified cases in the
## full dataset (17018 / 25480 ≈ 66.8%), so compare against a majority-class baseline
## before reading this as a useful model.
acc_score = accuracy_score(y_valid, case_status)
print(f'Validation Accuracy Score: {round(acc_score * 100, 1)}%')

Validation Accuracy Score: 66.6%


## Saving The Model

In [40]:
import pickle

In [41]:
## Open the output file in binary write mode; this is where the model is saved
with open('visa-approval-model.bin', 'wb') as f_out:
    ## Store the fitted vectorizer and the model together as one (dv, model) tuple
    pickle.dump((dv,model), f_out)

## Loading The Model

In [42]:
## Restore the (dv, model) tuple from the file written by the saving cell.
## NOTE: pickle.load can execute arbitrary code, so only load files from a trusted source.
with open('visa-approval-model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [46]:
## Example applicant record used to try out the loaded model.
## The feature values are those of the first dataset row (EZYV01), whose recorded status is Denied.
## NOTE(review): 'case_id' is not among the feature columns the vectorizer was fitted on;
## per the scikit-learn docs DictVectorizer.transform silently ignores unseen keys — confirm,
## then consider removing this key.
applicant = {
 'case_id': 0,
 'continent': 'Asia',
 'education_of_employee': 'High School',
 'has_job_experience': 'N',
 'requires_job_training': 'N',
 'no_of_employees': 14513,
 'yr_of_estab': 2007,    
 'region_of_employment': 'West',
 'prevailing_wage': 592.2029,
 'unit_of_wage': 'Hour',
 'full_time_position': 'Y',    
}

In [47]:
def predict_single(df, dv, model):
    """Return the model's column-1 probability for a single record.

    Parameters
    ----------
    df : mapping
        Feature-name -> value record for one applicant. The parameter name is
        kept for backward compatibility; the notebook passes a plain dict.
    dv : object
        Fitted vectorizer exposing ``transform(list_of_records)``.
    model : object
        Fitted classifier exposing ``predict_proba(X)``.

    Returns
    -------
    The entry in column 1 of ``model.predict_proba`` for that record.
    """
    ## Vectorize the record that was passed in. The previous body ignored this
    ## argument and always read the notebook-level `applicant` variable.
    X = dv.transform([df])
    y_pred = model.predict_proba(X)[:, 1]
    return y_pred[0]

In [48]:
prediction = predict_single(applicant, dv, model)

In [49]:
print(f'{prediction}')

0.6179479925086814


In [50]:
## Turn the probability into a verdict with a 0.5 cut-off.
verdict = 'Approve' if prediction >= 0.5 else 'Reject'
print(f'verdict: {verdict}')

verdict: Approve
