In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import sklearn
import dvc
import mlflow

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    LabelEncoder,
    MinMaxScaler,
)
from sklearn.model_selection import train_test_split    
from sklearn.pipeline import (
    make_pipeline,
    Pipeline
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    f1_score,
    recall_score
)
from sklearn.compose import ColumnTransformer


In [4]:
df = pd.read_csv("Visadataset.csv")

In [5]:
df.sample(4)

Unnamed: 0,case_id,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
15483,EZYV15484,Asia,Bachelor's,N,N,1767,2010,West,135852.65,Year,N,Denied
13838,EZYV13839,Europe,Doctorate,Y,N,101293,1999,South,10705.54,Year,Y,Certified
24140,EZYV24141,Asia,Master's,N,N,1011,1994,West,138708.21,Year,Y,Denied
21353,EZYV21354,Asia,High School,Y,Y,3222,1999,West,139302.76,Year,Y,Certified


In [6]:
df.isnull().sum()

case_id                  0
continent                0
education_of_employee    0
has_job_experience       0
requires_job_training    0
no_of_employees          0
yr_of_estab              0
region_of_employment     0
prevailing_wage          0
unit_of_wage             0
full_time_position       0
case_status              0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.shape

(25480, 12)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_id                25480 non-null  object 
 1   continent              25480 non-null  object 
 2   education_of_employee  25480 non-null  object 
 3   has_job_experience     25480 non-null  object 
 4   requires_job_training  25480 non-null  object 
 5   no_of_employees        25480 non-null  int64  
 6   yr_of_estab            25480 non-null  int64  
 7   region_of_employment   25480 non-null  object 
 8   prevailing_wage        25480 non-null  float64
 9   unit_of_wage           25480 non-null  object 
 10  full_time_position     25480 non-null  object 
 11  case_status            25480 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 2.3+ MB


In [10]:
df["case_id"].value_counts()

case_id
EZYV25480    1
EZYV01       1
EZYV02       1
EZYV03       1
EZYV04       1
            ..
EZYV13       1
EZYV12       1
EZYV11       1
EZYV10       1
EZYV09       1
Name: count, Length: 25480, dtype: int64

In [11]:
df=df.drop('case_id', axis=1)

In [12]:
df['education_of_employee'].value_counts()  # 

education_of_employee
Bachelor's     10234
Master's        9634
High School     3420
Doctorate       2192
Name: count, dtype: int64

In [13]:
df['education_of_employee'] = df['education_of_employee'].replace({
    "Bachelor's" : 1,
    "Master's": 2,
    "High School": 3,
    "Doctorate": 4
})
df['education_of_employee'] = df['education_of_employee'].astype(int)

  df['education_of_employee'] = df['education_of_employee'].replace({


In [14]:
df["continent"].value_counts()

continent
Asia             16861
Europe            3732
North America     3292
South America      852
Africa             551
Oceania            192
Name: count, dtype: int64

In [15]:
df['has_job_experience'].value_counts()

has_job_experience
Y    14802
N    10678
Name: count, dtype: int64

In [16]:
df['requires_job_training'].value_counts()

requires_job_training
N    22525
Y     2955
Name: count, dtype: int64

In [17]:
df['region_of_employment'].value_counts()

region_of_employment
Northeast    7195
South        7017
West         6586
Midwest      4307
Island        375
Name: count, dtype: int64

In [18]:
df['unit_of_wage'].value_counts()  #

unit_of_wage
Year     22962
Hour      2157
Week       272
Month       89
Name: count, dtype: int64

In [19]:
df['unit_of_wage'] = df['unit_of_wage'].replace({
    "Year": 4,            
    "Hour": 1,      
    "Week": 2,       
    "Month" :3})
df['unit_of_wage'] = df['unit_of_wage'].astype(int)


  df['unit_of_wage'] = df['unit_of_wage'].replace({


In [20]:
df['full_time_position'].value_counts()

full_time_position
Y    22773
N     2707
Name: count, dtype: int64

In [21]:
df['case_status'].value_counts()       #

case_status
Certified    17018
Denied        8462
Name: count, dtype: int64

In [22]:
df['case_status'] = df['case_status'].replace({
    "Certified": 1,
    "Denied": 0
})
df['case_status'] = df['case_status'].astype(int)

  df['case_status'] = df['case_status'].replace({


In [23]:
df

Unnamed: 0,continent,education_of_employee,has_job_experience,requires_job_training,no_of_employees,yr_of_estab,region_of_employment,prevailing_wage,unit_of_wage,full_time_position,case_status
0,Asia,3,N,N,14513,2007,West,592.2029,1,Y,0
1,Asia,2,Y,N,2412,2002,Northeast,83425.6500,4,Y,1
2,Asia,1,N,Y,44444,2008,West,122996.8600,4,Y,0
3,Asia,1,N,N,98,1897,West,83434.0300,4,Y,0
4,Africa,2,Y,N,1082,2005,South,149907.3900,4,Y,1
...,...,...,...,...,...,...,...,...,...,...,...
25475,Asia,1,Y,Y,2601,2008,South,77092.5700,4,Y,1
25476,Asia,3,Y,N,3274,2006,Northeast,279174.7900,4,Y,1
25477,Asia,2,Y,N,1121,1910,South,146298.8500,4,N,1
25478,Asia,2,Y,Y,1918,1887,West,86154.7700,4,Y,1


In [24]:
# case_status, unit_of_wage, education_of_employee
# full_time_position, region_of_employment, requires_job_training, has_job_experience, continent

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25480 entries, 0 to 25479
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   continent              25480 non-null  object 
 1   education_of_employee  25480 non-null  int64  
 2   has_job_experience     25480 non-null  object 
 3   requires_job_training  25480 non-null  object 
 4   no_of_employees        25480 non-null  int64  
 5   yr_of_estab            25480 non-null  int64  
 6   region_of_employment   25480 non-null  object 
 7   prevailing_wage        25480 non-null  float64
 8   unit_of_wage           25480 non-null  int64  
 9   full_time_position     25480 non-null  object 
 10  case_status            25480 non-null  int64  
dtypes: float64(1), int64(5), object(5)
memory usage: 2.1+ MB


In [31]:
preprocess = ColumnTransformer(transformers=[
    ('one', OneHotEncoder(), ['full_time_position', 'region_of_employment', 'requires_job_training', 'has_job_experience', 'continent']),
    ("std", StandardScaler(), ['no_of_employees', 'yr_of_estab', 'prevailing_wage'])
],
   remainder='passthrough'
)

In [51]:

pipeline = Pipeline(steps =[
    ('prpreprocessepro', preprocess),
    ("classi", RandomForestClassifier(
        n_estimators=500,
        max_depth=20,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced',
        random_state=42))
])

In [52]:
train_data, test_data=train_test_split(df, random_state=42, test_size=0.2)

In [53]:
X_train = train_data.drop(columns=['case_status'])
y_train = train_data['case_status']

X_test = test_data.drop(columns=['case_status'])
y_test = test_data['case_status']

In [54]:
pipeline.fit(X_train, y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [55]:

y_pred = pipeline.predict(X_test)

In [56]:
acc = accuracy_score(y_test,y_pred)
pre = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [57]:
print("acc:->",acc)
print('pre:->', pre)
print('recall:->', recall)
print('f1 :->', f1)

acc:-> 0.7343014128728415
pre:-> 0.8056136160047775
recall:-> 0.7932960893854749
f1 :-> 0.7994074074074075
